Repository: pramalhe/OneFile
Branch: master
Commit: 49654893f081
Files: 412
Total size: 3.5 MB

Directory structure:
gitextract_33ubg_0h/
├── LICENSE.txt
├── README.md
├── common/
│   ├── HazardEras.hpp
│   ├── HazardPointers.hpp
│   ├── HazardPointersSimQueue.hpp
│   ├── README.md
│   ├── RIStaticPerThread.hpp
│   ├── ThreadRegistry.cpp
│   ├── ThreadRegistry.hpp
│   └── pfences.h
├── datastructures/
│   ├── generic/
│   │   ├── TMHashMap.hpp
│   │   ├── TMLinkedListQueue.hpp
│   │   ├── TMLinkedListSet.hpp
│   │   └── TMRedBlackBST.hpp
│   ├── hashmaps/
│   │   ├── CRWWPSTMResizableHashSet.hpp
│   │   ├── ESTMResizableHashSet.hpp
│   │   ├── OFLFResizableHashSet.hpp
│   │   ├── OFWFResizableHashSet.hpp
│   │   └── TinySTMResizableHashSet.hpp
│   ├── linkedlists/
│   │   ├── CRWWPLinkedListSet.hpp
│   │   ├── ESTMLinkedListSet.hpp
│   │   ├── HazardEras.hpp
│   │   ├── HazardPointers.hpp
│   │   ├── MagedHarrisLinkedListSetHE.hpp
│   │   ├── MagedHarrisLinkedListSetHP.hpp
│   │   ├── OFLFLinkedListSet.hpp
│   │   ├── OFWFLinkedListSet.hpp
│   │   ├── STMLinkedListSet.hpp
│   │   └── TinySTMLinkedListSet.hpp
│   ├── queues/
│   │   ├── CRWWPLinkedListQueue.hpp
│   │   ├── ESTMArrayLinkedListQueue.hpp
│   │   ├── ESTMLinkedListQueue.hpp
│   │   ├── FAAArrayQueue.hpp
│   │   ├── HazardPointers.hpp
│   │   ├── HazardPointersSimQueue.hpp
│   │   ├── LCRQueue.hpp
│   │   ├── MichaelScottQueue.hpp
│   │   ├── OFLFArrayLinkedListQueue.hpp
│   │   ├── OFLFArrayQueue.hpp
│   │   ├── OFLFLinkedListQueue.hpp
│   │   ├── OFWFArrayLinkedListQueue.hpp
│   │   ├── OFWFLinkedListQueue.hpp
│   │   ├── README.md
│   │   ├── SimQueue.hpp
│   │   ├── TinySTMArrayLinkedListQueue.hpp
│   │   ├── TinySTMLinkedListQueue.hpp
│   │   └── TurnQueue.hpp
│   ├── sequential/
│   │   ├── HashSet.hpp
│   │   ├── LinkedListQueue.hpp
│   │   ├── LinkedListSet.hpp
│   │   ├── RedBlackBST.hpp
│   │   ├── SortedArraySet.hpp
│   │   ├── SortedVectorSet.hpp
│   │   └── TreeSet.hpp
│   ├── treemaps/
│   │   ├── ESTMRedBlackTree.hpp
│   │   ├── HazardEras.hpp
│   │   ├── NatarajanTreeHE.hpp
│   │   ├── OFLFRedBlackTree.hpp
│   │   ├── OFWFRedBlackTree.hpp
│   │   └── TinySTMRedBlackTree.hpp
│   ├── trevor_brown_abtree/
│   │   ├── Makefile
│   │   ├── TrevorBrownABTree.hpp
│   │   ├── common/
│   │   │   ├── atomic_ops/
│   │   │   │   ├── atomic_ops/
│   │   │   │   │   ├── generalize-small.h
│   │   │   │   │   ├── generalize.h
│   │   │   │   │   └── sysdeps/
│   │   │   │   │       ├── README
│   │   │   │   │       ├── acquire_release_volatile.h
│   │   │   │   │       ├── aligned_atomic_load_store.h
│   │   │   │   │       ├── all_acquire_release_volatile.h
│   │   │   │   │       ├── all_aligned_atomic_load_store.h
│   │   │   │   │       ├── all_atomic_load_store.h
│   │   │   │   │       ├── ao_t_is_int.h
│   │   │   │   │       ├── armcc/
│   │   │   │   │       │   └── arm_v6.h
│   │   │   │   │       ├── atomic_load_store.h
│   │   │   │   │       ├── char_acquire_release_volatile.h
│   │   │   │   │       ├── char_atomic_load_store.h
│   │   │   │   │       ├── emul_cas.h
│   │   │   │   │       ├── gcc/
│   │   │   │   │       │   ├── alpha.h
│   │   │   │   │       │   ├── arm.h
│   │   │   │   │       │   ├── avr32.h
│   │   │   │   │       │   ├── cris.h
│   │   │   │   │       │   ├── hppa.h
│   │   │   │   │       │   ├── ia64.h
│   │   │   │   │       │   ├── m68k.h
│   │   │   │   │       │   ├── mips.h
│   │   │   │   │       │   ├── powerpc.h
│   │   │   │   │       │   ├── s390.h
│   │   │   │   │       │   ├── sh.h
│   │   │   │   │       │   ├── sparc.h
│   │   │   │   │       │   ├── x86.h
│   │   │   │   │       │   └── x86_64.h
│   │   │   │   │       ├── generic_pthread.h
│   │   │   │   │       ├── hpc/
│   │   │   │   │       │   ├── hppa.h
│   │   │   │   │       │   └── ia64.h
│   │   │   │   │       ├── ibmc/
│   │   │   │   │       │   └── powerpc.h
│   │   │   │   │       ├── icc/
│   │   │   │   │       │   └── ia64.h
│   │   │   │   │       ├── int_acquire_release_volatile.h
│   │   │   │   │       ├── int_aligned_atomic_load_store.h
│   │   │   │   │       ├── int_atomic_load_store.h
│   │   │   │   │       ├── msftc/
│   │   │   │   │       │   ├── arm.h
│   │   │   │   │       │   ├── common32_defs.h
│   │   │   │   │       │   ├── x86.h
│   │   │   │   │       │   └── x86_64.h
│   │   │   │   │       ├── ordered.h
│   │   │   │   │       ├── ordered_except_wr.h
│   │   │   │   │       ├── read_ordered.h
│   │   │   │   │       ├── short_acquire_release_volatile.h
│   │   │   │   │       ├── short_aligned_atomic_load_store.h
│   │   │   │   │       ├── short_atomic_load_store.h
│   │   │   │   │       ├── standard_ao_double_t.h
│   │   │   │   │       ├── sunc/
│   │   │   │   │       │   ├── sparc.h
│   │   │   │   │       │   ├── x86.h
│   │   │   │   │       │   └── x86_64.h
│   │   │   │   │       ├── test_and_set_t_is_ao_t.h
│   │   │   │   │       └── test_and_set_t_is_char.h
│   │   │   │   └── atomic_ops.h
│   │   │   ├── dcss/
│   │   │   │   ├── dcss_plus.h
│   │   │   │   ├── dcss_plus_impl.h
│   │   │   │   └── testing.cpp
│   │   │   ├── descriptors/
│   │   │   │   ├── descriptors.h
│   │   │   │   ├── descriptors_impl.h
│   │   │   │   └── descriptors_impl2.h
│   │   │   ├── errors.h
│   │   │   ├── plaf.h
│   │   │   ├── recordmgr/
│   │   │   │   ├── allocator_bump.h
│   │   │   │   ├── allocator_interface.h
│   │   │   │   ├── allocator_new.h
│   │   │   │   ├── allocator_new_segregated.h
│   │   │   │   ├── allocator_once.h
│   │   │   │   ├── arraylist.h
│   │   │   │   ├── blockbag.h
│   │   │   │   ├── blockpool.h
│   │   │   │   ├── debug_info.h
│   │   │   │   ├── debugcounter.h
│   │   │   │   ├── debugprinting.h
│   │   │   │   ├── globals.h
│   │   │   │   ├── hashtable.h
│   │   │   │   ├── lockfreeblockbag.h
│   │   │   │   ├── pool_interface.h
│   │   │   │   ├── pool_none.h
│   │   │   │   ├── pool_perthread_and_shared.h
│   │   │   │   ├── reclaimer_debra.h
│   │   │   │   ├── reclaimer_debraplus.h
│   │   │   │   ├── reclaimer_hazardptr.h
│   │   │   │   ├── reclaimer_interface.h
│   │   │   │   ├── reclaimer_none.h
│   │   │   │   ├── reclaimer_rcu.h
│   │   │   │   ├── record_manager.h
│   │   │   │   ├── record_manager_single_type.h
│   │   │   │   └── recovery_manager.h
│   │   │   ├── rq/
│   │   │   │   ├── rq_dcssp.h
│   │   │   │   ├── rq_debugging.h
│   │   │   │   ├── rq_htm_rwlock.h
│   │   │   │   ├── rq_provider.h
│   │   │   │   ├── rq_rwlock.h
│   │   │   │   ├── rq_snapcollector.h
│   │   │   │   ├── rq_unsafe.h
│   │   │   │   └── snapcollector/
│   │   │   │       ├── reportitem.h
│   │   │   │       ├── snapcollector.h
│   │   │   │       └── snapcollector_test.cpp
│   │   │   └── rwlock.h
│   │   ├── ds/
│   │   │   └── brown_ext_abtree_lf/
│   │   │       ├── brown_ext_abtree_lf.h
│   │   │       ├── brown_ext_abtree_lf_adapter.h
│   │   │       └── brown_ext_abtree_lf_impl.h
│   │   └── minimal_example.cpp
│   └── trevor_brown_natarajan/
│       ├── TrevorBrownNatarajanTree.hpp
│       └── ds/
│           └── natarajan_ext_bst_lf/
│               ├── natarajan_ext_bst_lf_adapter.h
│               ├── natarajan_ext_bst_lf_stage1.h
│               └── natarajan_ext_bst_lf_stage2_impl.h
├── graphs/
│   ├── BenchmarkLatencyCounter.hpp
│   ├── BenchmarkLatencyQueues.hpp
│   ├── BenchmarkMaps.hpp
│   ├── BenchmarkQueues.hpp
│   ├── BenchmarkSPS.hpp
│   ├── BenchmarkSets.hpp
│   ├── Makefile
│   ├── PBenchmarkQueues.hpp
│   ├── PBenchmarkSPS.hpp
│   ├── PBenchmarkSets.hpp
│   ├── README.md
│   ├── bin/
│   │   └── .gitignore
│   ├── data/
│   │   └── README.md
│   ├── latency-counter.cpp
│   ├── lib/
│   │   └── .gitignore
│   ├── plots/
│   │   ├── caption.gp
│   │   ├── latency-counter.gp
│   │   ├── pcaption.gp
│   │   ├── plot-all.sh
│   │   ├── plot.sh
│   │   ├── pq-enq-deq.gp
│   │   ├── pq-ll-enq-deq.gp
│   │   ├── pset-hash-1k.gp
│   │   ├── pset-ll-1k.gp
│   │   ├── pset-tree-1k.gp
│   │   ├── pset-tree-1m.gp
│   │   ├── psps-integer.gp
│   │   ├── q-array-enq-deq.gp
│   │   ├── q-enq-deq.gp
│   │   ├── q-ll-enq-deq.gp
│   │   ├── set-hash-1k.gp
│   │   ├── set-ll-10k.gp
│   │   ├── set-ll-1k.gp
│   │   ├── set-tree-10k.gp
│   │   ├── set-tree-1k.gp
│   │   ├── sps-integer.gp
│   │   ├── sps-object.gp
│   │   ├── stress-multi-process-q.gp
│   │   └── styles.inc
│   ├── pq-ll-enq-deq.cpp
│   ├── pread-while-writing.cpp
│   ├── pset-hash-1k.cpp
│   ├── pset-ll-10k.cpp
│   ├── pset-ll-1k.cpp
│   ├── pset-tree-1k.cpp
│   ├── pset-tree-1m.cpp
│   ├── psps-integer.cpp
│   ├── q-array-enq-deq.cpp
│   ├── q-ll-enq-deq.cpp
│   ├── run-all-aws.sh
│   ├── set-hash-1k.cpp
│   ├── set-ll-10k.cpp
│   ├── set-ll-1k.cpp
│   ├── set-tree-10k.cpp
│   ├── set-tree-1k.cpp
│   ├── set-tree-1m.cpp
│   ├── sps-integer.cpp
│   └── sps-object.cpp
├── pdatastructures/
│   ├── README.md
│   ├── TMHashMap.hpp
│   ├── TMHashMapByRef.hpp
│   ├── TMLinkedListQueue.hpp
│   ├── TMLinkedListSet.hpp
│   ├── TMLinkedListSetByRef.hpp
│   ├── TMRedBlackTree.hpp
│   ├── TMRedBlackTreeByRef.hpp
│   └── pqueues/
│       ├── HazardPointers.hpp
│       ├── MichaelScottQueue.hpp
│       ├── PFriedmanQueue.hpp
│       ├── PMDKLinkedListQueue.hpp
│       ├── PMichaelScottQueue.hpp
│       ├── POFLFLinkedListQueue.hpp
│       ├── POFLFMPLinkedListQueue.hpp
│       ├── POFWFLinkedListQueue.hpp
│       ├── RomLRLinkedListQueue.hpp
│       └── RomLogLinkedListQueue.hpp
├── ptms/
│   ├── OneFilePTMLF.hpp
│   ├── OneFilePTMLFMultiProcess.hpp
│   ├── OneFilePTMWF.hpp
│   ├── PMDKTM.hpp
│   ├── README.md
│   ├── atlas/
│   │   ├── README.md
│   │   └── atlas.patch
│   ├── romuluslog/
│   │   ├── RomulusLog.cpp
│   │   ├── RomulusLog.hpp
│   │   └── malloc.cpp
│   ├── romuluslr/
│   │   ├── RomulusLR.cpp
│   │   ├── RomulusLR.hpp
│   │   └── malloc.cpp
│   └── rwlocks/
│       ├── CRWWP.hpp
│       └── CRWWP_SpinLock.hpp
└── stms/
    ├── CRWWPSTM.hpp
    ├── ESTM.hpp
    ├── OneFileLF.hpp
    ├── OneFileWF.hpp
    ├── TinySTM.hpp
    ├── estm-0.3.0/
    │   ├── .gitignore
    │   ├── AUTHORS
    │   ├── COPYING
    │   ├── Makefile
    │   ├── Makefile.in
    │   ├── README
    │   ├── VERSIONS
    │   ├── include/
    │   │   ├── mod_local.h
    │   │   ├── mod_mem.h
    │   │   ├── mod_print.h
    │   │   ├── mod_stats.h
    │   │   ├── stm.h
    │   │   └── wrappers.h
    │   └── src/
    │       ├── atomic.h
    │       ├── atomic_ops/
    │       │   ├── AUTHORS
    │       │   ├── COPYING
    │       │   ├── README
    │       │   ├── aligned_atomic_load_store.h
    │       │   ├── all_acquire_release_volatile.h
    │       │   ├── ao_t_is_int.h
    │       │   ├── atomic_ops.h
    │       │   ├── generalize-small.h
    │       │   ├── generalize.h
    │       │   ├── ia64.h
    │       │   ├── ordered_except_wr.h
    │       │   ├── powerpc.h
    │       │   ├── read_ordered.h
    │       │   ├── sparc.h
    │       │   ├── standard_ao_double_t.h
    │       │   ├── test_and_set_t_is_ao_t.h
    │       │   ├── test_and_set_t_is_char.h
    │       │   ├── x86.h
    │       │   └── x86_64.h
    │       ├── gc.c
    │       ├── gc.h
    │       ├── mod_local.c
    │       ├── mod_mem.c
    │       ├── mod_print.c
    │       ├── mod_stats.c
    │       ├── stm.c
    │       └── wrappers.c
    └── tinystm/
        ├── ChangeLog
        ├── Doxyfile
        ├── GNU-LICENSE.txt
        ├── MIT-LICENSE.txt
        ├── Makefile
        ├── Makefile.clang
        ├── Makefile.common
        ├── Makefile.gcc
        ├── Makefile.icc
        ├── Makefile.suncc
        ├── README.md
        ├── abi/
        │   ├── Makefile
        │   ├── Makefile.common
        │   ├── abi.c
        │   ├── arch_x86.S
        │   ├── dtmc/
        │   │   ├── Makefile
        │   │   ├── arch.S
        │   │   ├── libitm.h
        │   │   ├── libtanger-stm.public-symbols
        │   │   ├── libtanger-stm.support
        │   │   ├── tanger-stm-internal.h
        │   │   ├── tanger.c
        │   │   ├── tanger.h
        │   │   └── tm_macros.h
        │   ├── gcc/
        │   │   ├── Makefile
        │   │   ├── alloc_cpp.c
        │   │   ├── arch.S
        │   │   ├── clone.c
        │   │   ├── eh.c
        │   │   ├── libitm.h
        │   │   └── tm_macros.h
        │   ├── intel/
        │   │   ├── Makefile
        │   │   ├── alloc.c
        │   │   ├── arch.S
        │   │   ├── libitm.h
        │   │   └── tm_macros.h
        │   ├── libitm.h.tpl.cpp
        │   ├── libitm.h.tpl.footer
        │   ├── libitm.h.tpl.header
        │   ├── libitm.h.tpl.unifdef
        │   ├── oracle/
        │   │   ├── Makefile
        │   │   ├── arch.S
        │   │   └── otm.c
        │   ├── pthread_wrapper.h
        │   ├── test/
        │   │   └── Makefile
        │   └── tm_macros.h
        ├── include/
        │   ├── mod_ab.h
        │   ├── mod_cb.h
        │   ├── mod_log.h
        │   ├── mod_mem.h
        │   ├── mod_order.h
        │   ├── mod_print.h
        │   ├── mod_stats.h
        │   ├── stm.h
        │   └── wrappers.h
        ├── lib/
        │   └── .gitignore
        ├── src/
        │   ├── .gitignore
        │   ├── atomic.h
        │   ├── atomic_ops/
        │   │   ├── AUTHORS
        │   │   ├── COPYING
        │   │   ├── README
        │   │   ├── aligned_atomic_load_store.h
        │   │   ├── all_acquire_release_volatile.h
        │   │   ├── ao_t_is_int.h
        │   │   ├── atomic_ops.h
        │   │   ├── generalize-small.h
        │   │   ├── generalize.h
        │   │   ├── ia64.h
        │   │   ├── ordered_except_wr.h
        │   │   ├── powerpc.h
        │   │   ├── read_ordered.h
        │   │   ├── sparc.h
        │   │   ├── standard_ao_double_t.h
        │   │   ├── test_and_set_t_is_ao_t.h
        │   │   ├── test_and_set_t_is_char.h
        │   │   ├── x86.h
        │   │   └── x86_64.h
        │   ├── gc.c
        │   ├── gc.h
        │   ├── mod_ab.c
        │   ├── mod_cb_mem.c
        │   ├── mod_log.c
        │   ├── mod_order.c
        │   ├── mod_print.c
        │   ├── mod_stats.c
        │   ├── stm.c
        │   ├── stm_internal.h
        │   ├── stm_wbctl.h
        │   ├── stm_wbetl.h
        │   ├── stm_wt.h
        │   ├── tls.h
        │   ├── utils.h
        │   └── wrappers.c
        └── test/
            ├── Makefile
            ├── intset/
            │   ├── .gitignore
            │   ├── Makefile
            │   ├── README.rbtree
            │   ├── intset.c
            │   ├── rbtree.c
            │   ├── rbtree.h
            │   ├── tm.h
            │   └── types.h
            └── regression/
                ├── .gitignore
                ├── Makefile
                ├── irrevocability.c
                ├── perf.c
                └── types.c

================================================
FILE CONTENTS
================================================

================================================
FILE: LICENSE.txt
================================================
Copyright (c) 2017-2018
  Andreia Correia
  Pedro Ramalhete
  Pascal Felber
  Nachshon Cohen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# OneFile PTM / STM

OneFile is a Software Transactional Memory (STM) meant to make it easy to implement lock-free and wait-free data structures.
It is based on the paper "[OneFile: A Wait-free Persistent Transactional Memory](https://github.com/pramalhe/OneFile/blob/master/OneFile-2019.pdf)" by Ramalhete, Correia, Felber and Cohen
https://github.com/pramalhe/OneFile/blob/master/OneFile-2019.pdf

It provides multi-word atomic updates on *tmtype* objects, where the wrapped type T must be word-sized, typically a pointer or an integer.
During a transaction, each store on a *tmtype* is transformed into a double-word compare-and-swap DCAS(), and one more regular CAS() is done to complete the transaction.
It does this with a store-log (write-set) which other writers can help apply.
This is a "redo-log" based technique, which means that both stores and loads need to be interposed.
Stores are interposed to save them in the log, and loads are interposed to look up the most recent value in the log.
If there is a transaction currently ongoing, readers have to check on each *tmtype::pload()* whether the variable we're trying to read is part of the current transaction.
If a value is read whose 'seq' is higher than the transaction we initially read, the whole read-only operation is restarted, by throwing an exception in the *tmtype::pload()* interposing method and catching this exception in the TM.
All of this logic is handled internally by OneFile without any explicit user interaction.
Because of operator overloading, the assignment and reading of *tmtype* types is done transparently to the user, with a pure library implementation, without any need for compiler instrumentation.
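As a concrete illustration of the type annotation, here is a minimal sketch of our own, in the style of the data structures in datastructures/ (the `oflf` namespace and the names `Node`/`insertAfter` are our assumptions, not code from the repository):

    #include "OneFileLF.hpp"
    using namespace oflf;      // assumption: the namespace used by OneFile-LF

    struct Node : public tmbase {          // derive from tmbase so the node can be reclaimed
        tmtype<uint64_t> key {0};          // each word-sized shared field is wrapped in a tmtype
        tmtype<Node*>    next {nullptr};
    };

    // Plain assignments on tmtype fields are interposed automatically:
    void insertAfter(Node* pred, Node* node) {
        node->next = pred->next;   // interposed load (pload) and store (pstore)
        pred->next = node;         // another interposed store
    }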
This means that the user can write the code as if it were a sequential implementation of the data structure, apart from the change of types (type annotation).
In this sense, OneFile is a "quasi-universal construction" with lock-free progress.

Our design goal with OneFile was to provide a non-blocking STM so that non-experts could implement their own lock-free and wait-free data structures.
OneFile is not designed to transform regular everyday code into lock-free applications.
Such use-cases require a lot more engineering work and likely a completely different approach from the one we took with OneFile (CX is a much better option for that purpose).

We've also made two implementations in the form of Persistent Transactional Memory (PTM), which are STMs meant for persistent memory, like Intel's Optane DC Persistent Memory.
In total, we've implemented four different variants of this design:

- OneFile-LF: The simplest of the four, has lock-free progress and lock-free memory reclamation using Hazard Eras;
- OneFile-WF: Uses aggregation (like flat-combining) and a new wait-free consensus to provide wait-free bounded progress. Has wait-free bounded memory reclamation;
- POneFile-LF: A PTM with durable transactions (ACID) and lock-free progress. Memory reclamation is lock-free using an optimistic technique. Allocation and de-allocation of user objects is lock-free;
- POneFile-WF: A PTM with durable transactions (ACID) and wait-free progress. Memory reclamation for user objects is wait-free using an optimistic technique, while memory reclamation of the transactional objects is done using Hazard Eras, also wait-free. Allocation and de-allocation of user objects is wait-free.

See the respective .hpp files for implementation details. Each implementation is a single header file. Yes, it's that small :)

## Quickstart ##

If you just want to use OneFile in your own application or benchmarks then follow these steps:

- Choose one of the four OneFile implementations, depending on whether you want an STM or a PTM, with lock-free or wait-free progress:

    [stms/OneFileLF.hpp](https://github.com/pramalhe/OneFile/blob/master/stms/OneFileLF.hpp) STM with lock-free transactions
    [stms/OneFileWF.hpp](https://github.com/pramalhe/OneFile/blob/master/stms/OneFileWF.hpp) STM with wait-free transactions
    [ptms/OneFilePTMLF.hpp](https://github.com/pramalhe/OneFile/blob/master/ptms/OneFilePTMLF.hpp) PTM with lock-free transactions
    [ptms/OneFilePTMWF.hpp](https://github.com/pramalhe/OneFile/blob/master/ptms/OneFilePTMWF.hpp) PTM with wait-free transactions

- Copy the header to your development folder
- Include the header from a single .cpp. If you include it from multiple compilation units (.cpp files) then move the last block in the .hpp to one of the .cpp files.
- If you want a data structure that is already made, take a look at what's in these folders:

    datastructures/ Data structures for volatile memory (needs one of the STMs)
    pdatastructures/ Data structures for persistent memory (needs one of the PTMs)

### Design ###

In OneFile STM, a transaction goes through three phases.
The first phase is to convert the operation (lambda) into a store-log (write-set).
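Before going through the remaining phases, here is a minimal sketch (our illustration, not OneFile's actual internals) of the basic building block used to apply the store-log: each transactional word is conceptually a {value, sequence} pair updated with a double-word CAS, assuming gcc/clang on x86-64:

    #include <cstdint>
    #include <cstring>

    // A transactional word: the value plus the sequence number of the
    // transaction that last wrote it (the layout is illustrative only).
    struct alignas(16) TmWord {
        uint64_t val;
        uint64_t seq;
    };

    // On x86-64 (gcc/clang with -mcx16) a 16-byte CAS maps to cmpxchg16b.
    inline bool dcas(TmWord* addr, TmWord expected, TmWord desired) {
        __int128 exp, des;
        std::memcpy(&exp, &expected, sizeof exp);
        std::memcpy(&des, &desired, sizeof des);
        return __atomic_compare_exchange_n((__int128*)addr, &exp, des,
                                           false, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE);
    }

Because every store-log entry is applied with a DCAS that expects the pre-transaction sequence number, each entry can be applied at most once, which is what lets multiple helpers apply the same log without ABA issues.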
There is no need to save the loads (read-set) because, unlike other approaches, a transaction does not need to re-check for changes at commit time: it does an in-flight check on each load of whether or not the value has changed since the beginning of the transaction, by looking at a sequence number associated with every read value, a technique similar to TL2 or TinySTM, but without the need for keeping a read-set, because all write transactions are executed one at a time, effectively serialized.
The second phase is to commit the transaction by advancing the current transaction (curTx).
The third phase is to apply the store-log using DCAS.

The first phase is implicitly serializable. Even if each thread publishes its operation, there is no way to parallelize this work among threads.
The best that could be done would be for each thread to transform its own operation into its own store-log, which it then appends to a global store-log.
Unfortunately this is possible only for disjoint-access parallel transactions, and these are not easy to detect; therefore, our implementation of OneFile does not do this.
Instead, we attempt to parallelize the third phase, where the store-log is applied. This task is easier to split among multiple threads, thus parallelizing it.
Adding Flat-Combining or other similar aggregation techniques to the first stage means that each thread will produce a store-log containing the operations of all the other threads.
This can be a bottleneck if the operations involve heavy computation and produce small store-logs.
For data structures this is not the case, and OneFile is designed to implement and work with data structures or other scenarios where transactions are short in time; therefore, we found it acceptable to go with such an approach.

The parallelization of the third phase can be done with at least two different approaches: blocking and non-blocking.
In the blocking approach, the store-log can be divided into chunks (for example, one chunk per thread), each chunk having a lock, and the thread that takes the lock is responsible for applying that chunk.
In the non-blocking approach (OF-LF and OF-WF), each thread tries to apply one entry of the store-log at a time.
To avoid ABA issues, a double-word compare-and-swap (DCAS) must be used.

In summary, OneFile does *not* do disjoint-access parallel transactions. If you absolutely need that functionality, then go and take a look at TinySTM.

## Requirements ##

- OneFile needs a double-word CAS, which limits it to x86. The algorithm can be modified to use LL/SC or even single-word CAS, at the cost of losing its generic capability, because bits would have to be stolen from a 64 bit word;
- The user must "instrument" the code where the atomic updates take place by wrapping the types with *tmtype*. Even then, the operator overloading will not cover all the cases and there will be situations where the user has to annotate the code with .pload() or .pstore() respectively;
- The *T* type must be the size of a word, i.e. 64 bits.
  Anything bigger needs to be split into multiple *tmtype* objects;
- If memory reclamation is needed, then the objects need to derive from the *tmbase* base class, need to be allocated with *tmNew()* or *tmMalloc()* and deallocated with *tmDelete()* or *tmFree()*;

## Memory Reclamation ##

We're using a customized implementation of Hazard Eras, a lock-free/wait-free memory reclamation technique:
[https://github.com/pramalhe/ConcurrencyFreaks/blob/master/papers/hazarderas-2017.pdf](https://github.com/pramalhe/ConcurrencyFreaks/blob/master/papers/hazarderas-2017.pdf)
[https://dl.acm.org/citation.cfm?id=3087588](https://dl.acm.org/citation.cfm?id=3087588)

See the HazardErasOF class in each implementation for more details.
As far as we know, there is only one wait-free data structure that has integrated wait-free memory reclamation:
[https://github.com/pramalhe/ConcurrencyFreaks/blob/master/papers/crturnqueue-2016.pdf](https://github.com/pramalhe/ConcurrencyFreaks/blob/master/papers/crturnqueue-2016.pdf)
OneFile and CX are the first generic mechanisms for wait-free memory reclamation.

## How to use this ##

1. Annotate all the objects that are shared among threads, namely, everything that is *std::atomic* should be changed to *tmtype*;
2. Use only *pstore()* and *pload()* (or just use '='). Do *not* call compare_exchange_strong(), exchange() or fetch_add();
3. Replace calls to "obj = new T(args)" with "obj = tmNew(args)";
4. Replace calls to "delete obj" with "tmDelete(obj)";
5. The T types must derive from the base class *tmbase*;
6. Place your methods in a lambda, capturing whatever you need, and pass the lambda to *updateTx()*.

That's it, you've now got your own lock-free data structure!
For an example of a simple linked-list set, take a look at datastructures/linkedlists/OFLFLinkedListSet.hpp, or see the sketch at the end of this section.

## Disadvantages ##

- All mutative operations are serialized;
- Types must be broken down to 64 bit sizes;
- Requires a double-word compare-and-swap (DCAS);

## Advantages ##

- Lock-free programming was never so easy: all the user code has to do is loads and stores on *tmtype* types, and those get transformed into a DCAS()-based transaction that provides correct linearizable lock-free progress, without ABA issues;
- Memory reclamation is also handled by OneFile using Hazard Eras, a lock-free/wait-free memory reclamation technique;
- Compared to hand-written lock-free data structures, in the uncontended case we are replacing each CAS with a DCAS and adding one extra (regular) CAS on the currTrans, which is a small price to pay for the atomicity;
- This technique provides full linearizability for generic code, even mutative iterators, something which is nearly impossible to do with hand-written lock-free data structures;
- Multiple helping threads can help apply the store-log starting at different places. A good heuristic is to start from entry (tid % numStores);
- OneFile-WF is the first STM with wait-free bounded progress, and it's the first to have wait-free bounded progress with wait-free bounded memory reclamation;
- Read-only transactions are lightweight and they can run concurrently with write transactions as long as they're disjoint.

The biggest advantage of all is that it's way easier to use OneFile than it is to implement a hand-made lock-free or wait-free data structure.
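Putting the steps from "How to use this" together, here is a minimal sketch of a transactional stack (our own example; tmtype, tmbase, tmNew, tmDelete and updateTx are the API pieces described above, but the exact call syntax may differ slightly between variants, so check OFLFLinkedListSet.hpp for the authoritative pattern):

    #include "OneFileLF.hpp"
    using namespace oflf;   // assumption: the namespace used by OneFile-LF

    struct StackNode : public tmbase {                 // step 5: derive from tmbase
        tmtype<uint64_t>   val {0};                    // step 1: wrap shared words in tmtype
        tmtype<StackNode*> next {nullptr};
    };

    class TMStack {
        tmtype<StackNode*> head {nullptr};
    public:
        void push(uint64_t v) {
            updateTx([&] () {                          // step 6: run the lambda as one transaction
                StackNode* node = tmNew<StackNode>();  // step 3: tmNew instead of new
                node->val = v;                         // step 2: plain '=' does pstore()/pload()
                node->next = head;
                head = node;
            });
        }
        bool pop(uint64_t& out) {
            bool ok = false;
            updateTx([&] () {
                StackNode* node = head;
                if (node == nullptr) { ok = false; return; }
                out = node->val;
                head = node->next;
                tmDelete(node);                        // step 4: tmDelete instead of delete
                ok = true;
            });
            return ok;
        }
    };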
## Examples ##

There are some working examples in the "datastructures/" folder:

    OFLFArrayQueue.hpp: An array based queue (memory-bounded)
    OFLFLinkedListQueue.hpp: A linked list based queue (memory unbounded)
    OFLFLinkedListSet.hpp: A linked list based set
    OFLFRedBlackTree.hpp: A Red-Black (balanced) tree map

## Benchmarks ##

To build the benchmarks you need to build ESTM and TinySTM, and then you need to pull PMDK (PMEM/NVML) and build it:

    cd ~/onefile/stms/
    cd estm-0.3.0
    make clean ; make
    cd ..
    cd tinystm
    make clean ; make
    cd ..
    cd ~
    git clone https://github.com/pmem/pmdk.git
    cd pmdk
    make -j12
    sudo make install
    export PMEM_IS_PMEM_FORCE=1
    cd ~/onefile/graphs
    make -j12

## Tests ##

The four implementations of OneFile were executed for thousands of CPU hours and heavily stress tested with invariant checking, using tools like address sanitizer and valgrind.
This is a lot more than what other STMs on github provide, but it doesn't mean there are no bugs in it ;)
If you see a crash or an invariant failure, run the same code under a global rw-lock to make sure the bug is not in your code.
If you really believe it's in OneFile, then please open a bug on github and add as much information as you can, namely, the stack trace and the files needed to reproduce it. We'll do our best to address it.


================================================
FILE: common/HazardEras.hpp
================================================
/******************************************************************************
 * Copyright (c) 2016-2017, Pedro Ramalhete, Andreia Correia
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of Concurrency Freaks nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************************************************************
 */
#ifndef _HAZARD_ERAS_H_
#define _HAZARD_ERAS_H_

#include <atomic>
#include <cstdint>
#include <type_traits>
#include <vector>

/*
 *

 * <h1> Hazard Eras </h1>
 *
 * This is a light-weight implementation of hazard eras, where each thread has a
 * thread-local list of retired objects.
 *
 * This is based on the paper "Hazard Eras - Non-Blocking Memory Reclamation"
 * by Pedro Ramalhete and Andreia Correia:
 * github...
 *
 * The type T is for the objects/nodes and it's expected to have the following members:
 * newEra, delEra, delNext.
 *
 * R is zero.

 * @author Pedro Ramalhete
 * @author Andreia Correia
 */
template<typename T>
class HazardEras {
private:
    static const uint64_t NONE = 0;
    static const int HE_MAX_THREADS = 128;
    static const int MAX_HES = 5;          // This is named 'K' in the HP paper
    static const int CLPAD = 128/sizeof(std::atomic<uint64_t>);
    static const int HE_THRESHOLD_R = 0;   // This is named 'R' in the HP paper
    const int maxHEs;
    const int maxThreads;

    alignas(128) std::atomic<uint64_t>  eraClock {1};
    alignas(128) std::atomic<uint64_t>* he[HE_MAX_THREADS];
    alignas(128) std::vector<T*>        retiredList[HE_MAX_THREADS*CLPAD]; // It's not nice that we have a lot of empty vectors

public:
    HazardEras(int maxHEs=MAX_HES, int maxThreads=HE_MAX_THREADS) : maxHEs{maxHEs}, maxThreads{maxThreads} {
        for (int it = 0; it < HE_MAX_THREADS; it++) {
            he[it] = new std::atomic<uint64_t>[CLPAD*2]; // We allocate extra cache lines to allow for many hps without false sharing
            retiredList[it*CLPAD].reserve(maxThreads*maxHEs);
            for (int ihe = 0; ihe < MAX_HES; ihe++) {
                he[it][ihe].store(NONE, std::memory_order_relaxed);
            }
        }
        static_assert(std::is_same<decltype(T::newEra), uint64_t>::value, "T::newEra must be uint64_t");
        static_assert(std::is_same<decltype(T::delEra), uint64_t>::value, "T::delEra must be uint64_t");
    }

    ~HazardEras() {
        for (int it = 0; it < HE_MAX_THREADS; it++) {
            delete[] he[it];
            // Clear the current retired nodes
            for (unsigned iret = 0; iret < retiredList[it*CLPAD].size(); iret++) {
                delete retiredList[it*CLPAD][iret];
            }
        }
    }

    inline uint64_t getEra() {
        return eraClock.load();
    }

    /**
     * Progress Condition: wait-free bounded (by maxHEs)
     */
    inline void clear(const int tid) {
        for (int ihe = 0; ihe < maxHEs; ihe++) {
            he[tid][ihe].store(NONE, std::memory_order_release);
        }
    }

    /**
     * Progress Condition: lock-free
     */
    inline T* get_protected(int index, const std::atomic<T*>& atom, const int tid) {
        auto prevEra = he[tid][index].load(std::memory_order_relaxed);
        while (true) {
            T* ptr = atom.load();
            auto era = eraClock.load(std::memory_order_acquire);
            if (era == prevEra) return ptr;
            he[tid][index].store(era);
            prevEra = era;
        }
    }

    inline void protectEraRelease(int index, int other, const int tid) {
        auto era = he[tid][other].load(std::memory_order_relaxed);
        if (he[tid][index].load(std::memory_order_relaxed) == era) return;
        he[tid][index].store(era, std::memory_order_release);
    }

    /*
     * Does a single iteration. Must be integrated into the algorithm that's using HE.
     * In other words, we must re-check if the era has changed.
     *
     * Progress Condition: wait-free population oblivious
     */
    inline T* protectPtr(int index, const std::atomic<T*>& atom, uint64_t& prevEra, const int tid) {
        T* ptr = atom.load(std::memory_order_acquire);
        auto era = eraClock.load();
        if (prevEra != era) {
            prevEra = era;
            he[tid][index].store(era, std::memory_order_relaxed);
            std::atomic_thread_fence(std::memory_order_seq_cst);
        }
        return ptr;
    }

    /**
     * Retire an object (node)
     * Progress Condition: wait-free bounded
     */
    void retire(T* ptr, const int mytid) {
        auto currEra = eraClock.load();
        ptr->delEra = currEra;
        auto& rlist = retiredList[mytid*CLPAD];
        rlist.push_back(ptr);
        if (eraClock == currEra) eraClock.fetch_add(1);
        for (unsigned iret = 0; iret < rlist.size();) {
            auto obj = rlist[iret];
            if (canDelete(obj, mytid)) {
                rlist.erase(rlist.begin() + iret);
                delete obj;
                continue;
            }
            iret++;
        }
    }

private:
    bool canDelete(T* obj, const int mytid) {
        for (int tid = 0; tid < maxThreads; tid++) {
            for (int ihe = 0; ihe < maxHEs; ihe++) {
                const auto era = he[tid][ihe].load(std::memory_order_acquire);
                if (era == NONE || era < obj->newEra || era > obj->delEra) continue;
                return false;
            }
        }
        return true;
    }
};

#endif /* _HAZARD_ERAS_H_ */


================================================
FILE: common/HazardPointers.hpp
================================================
/******************************************************************************
 * Copyright (c) 2014-2016, Pedro Ramalhete, Andreia Correia
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of Concurrency Freaks nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************************************************************
 */
#ifndef _HAZARD_POINTERS_H_
#define _HAZARD_POINTERS_H_

#include <atomic>
#include <iostream>

/**
 * This is a customized version of Hazard Pointers to be used with CXMutation
 */
// TODO: use std::vector instead of arrays for the retired objects (keep the padding)
template<typename T>
class HazardPointers {
private:
    static const int MAX_THREADS = 128;
    static const int MAX_HPS = 5;
    static const int MAX_RETIRED = MAX_THREADS*MAX_HPS;
    static const int HP_THRESHOLD_R = 0;   // This is named 'R' in the HP paper
    static const int CLPAD = 128/sizeof(std::atomic<T*>);
    const int maxHPs;
    const int maxThreads;

    alignas(128) std::atomic<T*>* hp[MAX_THREADS*CLPAD];
    alignas(128) T** retiredObjects[MAX_THREADS*CLPAD];      // List of retired nodes that need to be 'deleted' for the current thread
    alignas(128) long numRetiredObjects[MAX_THREADS*CLPAD];  // Number of nodes in the retired list
    // Used specifically for CXMutation
    alignas(128) std::atomic<T*> heads[2*MAX_THREADS*CLPAD];

public:
    HazardPointers(int maxHPs=MAX_HPS, int maxThreads=MAX_THREADS) : maxHPs{maxHPs}, maxThreads{maxThreads} {
        for (int ih = 0; ih < 2*MAX_THREADS; ih++) {
            heads[ih*CLPAD].store(nullptr, std::memory_order_relaxed);
        }
        for (int ithread = 0; ithread < MAX_THREADS; ithread++) {
            numRetiredObjects[ithread*CLPAD] = 0;
            hp[ithread*CLPAD] = new std::atomic<T*>[MAX_HPS];
            for (int ihp = 0; ihp < MAX_HPS; ihp++) {
                hp[ithread*CLPAD][ihp].store(nullptr, std::memory_order_relaxed);
            }
            retiredObjects[ithread*CLPAD] = new T*[MAX_RETIRED];
            for (int iret = 0; iret < MAX_RETIRED; iret++) {
                retiredObjects[ithread*CLPAD][iret] = nullptr;
            }
        }
    }

    ~HazardPointers() {
        for (int ithread = 0; ithread < MAX_THREADS; ithread++) {
            // Clear the current retired nodes
            for (int iret = 0; iret < numRetiredObjects[ithread*CLPAD]; iret++) {
                delete (T*)retiredObjects[ithread*CLPAD][iret];
            }
            delete[] hp[ithread*CLPAD];
            delete[] retiredObjects[ithread*CLPAD];
        }
    }

    /**
     * Progress Condition: wait-free bounded (by maxHPs)
     *
     * It's ok to use relaxed loads here because:
     * - For progress: we know that the store will eventually become visible,
     *   or another publish() will take its place;
     * - For correctness: it can be re-ordered below, but at most it will protect
     *   an object for longer than required, i.e. until the next publish overwrites it.
     *   Or it gets re-ordered above, but only up to a seq-cst store on the same
     *   variable in publish(), which _must_ be it, even if the store in the publish
     *   is a release store (which is the case for publishRelease()).
     */
    void clear(const int tid) {
        for (int ihp = 0; ihp < maxHPs; ihp++) {
            hp[tid*CLPAD][ihp].store(nullptr, std::memory_order_relaxed);
        }
    }

    /**
     * Progress Condition: wait-free population oblivious
     */
    void clearOne(int ihp, const int tid) {
        hp[tid*CLPAD][ihp].store(nullptr, std::memory_order_relaxed);
    }

    /**
     * Progress Condition: lock-free
     */
    T* protect(int index, const std::atomic<T*>& atom, const int tid) {
        T* n = nullptr;
        T* ret;
        while ((ret = atom.load()) != n) {
            hp[tid*CLPAD][index].store(ret);
            n = ret;
        }
        return ret;
    }

    inline T* get_protected(int index, const std::atomic<T*>& atom, const int tid) {
        return protect(index, atom, tid);
    }

    /**
     * This returns the same value that is passed as ptr, which is sometimes useful
     * Progress Condition: wait-free population oblivious
     */
    T* protectPtr(int index, T* ptr, const int tid) {
        hp[tid*CLPAD][index].store(ptr);
        return ptr;
    }

    /**
     * This assumes that the ptr lhead is already protected by a "regular" hazard pointer
     */
    void protectHead(int combinedIndex, T* lhead) {
        heads[combinedIndex*CLPAD].store(lhead, std::memory_order_release);
    }

    std::atomic<T*>* getHeads() {
        return heads;
    }

    /**
     * This returns the same value that is passed as ptr, which is sometimes useful
     * Progress Condition: wait-free population oblivious
     */
    T* protectRelease(int index, T* ptr, const int tid) {
        hp[tid*CLPAD][index].store(ptr, std::memory_order_release);
        return ptr;
    }

    /**
     * Progress Condition: wait-free bounded (by the number of threads squared)
     */
    void retire(T* ptr, const int tid) {
        if (numRetiredObjects[tid*CLPAD] >= HP_THRESHOLD_R) scanAndDelete(tid);
        retiredObjects[tid*CLPAD][numRetiredObjects[tid*CLPAD]++] = ptr;
    }

    void copyPtr(int index, int other, const int tid) {
        auto ptr = hp[tid*CLPAD][other].load(std::memory_order_relaxed);
        hp[tid*CLPAD][index].store(ptr, std::memory_order_release);
    }

private:
    void scanAndDelete(const int tid) {
        for (int iret = 0; iret < numRetiredObjects[tid*CLPAD]; ) {
            bool ptrInUse = false;
            auto ptr = (T*)retiredObjects[tid*CLPAD][iret];
            for (int it = 0; it < maxThreads; it++) {
                for (int ihp = maxHPs-1; ihp >= 0; ihp--) {
                    if (ptr == hp[it*CLPAD][ihp].load()) ptrInUse = true;
                }
            }
            if (ptrInUse) { iret++; continue; }
            // Scan the array of heads before deleting the pointer
            for (int icomb = 0; icomb < 2*MAX_THREADS; icomb++) {
                if (ptr == heads[icomb*CLPAD].load()) ptrInUse = true;
            }
            if (ptrInUse) { iret++; continue; }
            for (int i = iret; i < numRetiredObjects[tid*CLPAD]-1; i++) retiredObjects[tid*CLPAD][i] = retiredObjects[tid*CLPAD][i+1];
            numRetiredObjects[tid*CLPAD]--;
            delete ptr;
        }
    }
};

#endif /* _HAZARD_POINTERS_H_ */


================================================
FILE: common/HazardPointersSimQueue.hpp
================================================
/******************************************************************************
 * Copyright (c) 2014-2016, Pedro Ramalhete, Andreia Correia
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of Concurrency Freaks nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************************************************************
 */
#ifndef _HAZARD_POINTERS_SIM_QUEUE_H_
#define _HAZARD_POINTERS_SIM_QUEUE_H_

#include <atomic>
#include <functional>
#include <iostream>
#include <vector>

/*
 * The main difference from this implementation to regular Hazard Pointers is
 * that the constructor takes a function pointer to a function 'find' which
 * acts as a callback, returning true if the pointer is still stored somewhere
 * in the data structure. This is used by SimQueue to indicate if there is a
 * pointer to the object we're trying to de-allocate in the array of enqReused.
 */
template<typename T>
class HazardPointersSimQueue {
private:
    static const int HP_MAX_THREADS = 128;
    static const int HP_MAX_HPS = 11;     // This is named 'K' in the HP paper
    static const int CLPAD = 128/sizeof(std::atomic<T*>);
    static const int HP_THRESHOLD_R = 0;  // This is named 'R' in the HP paper
    static const int MAX_RETIRED = HP_MAX_THREADS*HP_MAX_HPS; // Maximum number of retired objects per thread
    const int maxHPs;
    const int maxThreads;

    std::atomic<T*> hp[HP_MAX_THREADS*CLPAD][HP_MAX_HPS];
    // It's not nice that we have a lot of empty vectors, but we need padding to avoid false sharing
    std::vector<T*> retiredList[HP_MAX_THREADS*CLPAD];
    std::function<bool(T*)> findPtr;

public:
    HazardPointersSimQueue(std::function<bool(T*)>& find, int maxHPs=HP_MAX_HPS, int maxThreads=HP_MAX_THREADS)
            : maxHPs{maxHPs}, maxThreads{maxThreads} {
        findPtr = find;
        for (int ithread = 0; ithread < HP_MAX_THREADS; ithread++) {
            for (int ihp = 0; ihp < HP_MAX_HPS; ihp++) {
                hp[ithread*CLPAD][ihp].store(nullptr, std::memory_order_relaxed);
            }
        }
    }

    ~HazardPointersSimQueue() {
        for (int ithread = 0; ithread < HP_MAX_THREADS; ithread++) {
            // Clear the current retired nodes
            for (unsigned iret = 0; iret < retiredList[ithread*CLPAD].size(); iret++) {
                delete retiredList[ithread*CLPAD][iret];
            }
        }
    }

    /**
     * Progress Condition: wait-free bounded (by maxHPs)
     */
    void clear(const int tid) {
        for (int ihp = 0; ihp < maxHPs; ihp++) {
            hp[tid*CLPAD][ihp].store(nullptr, std::memory_order_release);
        }
    }

    /**
     * Progress Condition: wait-free population oblivious
     */
    void clearOne(int ihp, const int tid) {
        hp[tid*CLPAD][ihp].store(nullptr, std::memory_order_release);
    }

    /**
     * Progress Condition: lock-free
     */
    T* protect(int index, const std::atomic<T*>& atom, const int tid) {
        T* n = nullptr;
        T* ret;
        while ((ret = atom.load()) != n) {
            hp[tid*CLPAD][index].store(ret);
            n = ret;
        }
        return ret;
    }

    /**
     * This returns the same value that is passed as ptr, which is sometimes useful
     * Progress Condition: wait-free population oblivious
     */
    T* protectPtr(int index, T* ptr, const int tid) {
        hp[tid*CLPAD][index].store(ptr);
        return ptr;
    }

    /**
     * This returns the same value that is passed as ptr, which is sometimes useful
     * Progress Condition: wait-free population oblivious
     */
    T* protectRelease(int index, T* ptr, const int tid) {
        hp[tid*CLPAD][index].store(ptr, std::memory_order_release);
        return ptr;
    }

    /**
     * Progress Condition: wait-free bounded (by the number of threads squared)
     */
    void retire(T* ptr, const int tid) {
        retiredList[tid*CLPAD].push_back(ptr);
        for (unsigned iret = 0; iret < retiredList[tid*CLPAD].size();) {
            auto obj = retiredList[tid*CLPAD][iret];
            if (findPtr(obj)) {
                iret++;
                continue;
            }
            bool canDelete = true;
            for (int it = 0; it < maxThreads && canDelete; it++) {
                for (int ihp = maxHPs-1; ihp >= 0; ihp--) {
                    if (hp[it*CLPAD][ihp].load() == obj) {
                        canDelete = false;
                        break;
                    }
                }
            }
            if (canDelete) {
                retiredList[tid*CLPAD].erase(retiredList[tid*CLPAD].begin() + iret);
                delete obj;
                continue;
            }
            iret++;
        }
    }
};

#endif /* _HAZARD_POINTERS_H_ */


================================================
FILE: common/README.md
================================================
Here are some files that are needed by other libraries and data structures:

    HazardEras.hpp              Used by some of the lock-free data structures for memory reclamation
    HazardPointers.hpp          Used by some of the lock-free data structures for memory reclamation
    HazardPointersSimQueue.hpp  Used by SimQueue for memory reclamation. Notice that the original SimQueue
                                implementation in C does not have memory reclamation. This implementation in C++,
                                with this modified version of Hazard Pointers, was done by Correia and Ramalhete
    pfences.h                   Used by Romulus
    RIStaticPerThread.hpp       Used by Romulus
    ThreadRegistry.cpp          Used by Romulus
    ThreadRegistry.hpp          Used by Romulus


================================================
FILE: common/RIStaticPerThread.hpp
================================================
/******************************************************************************
 * Copyright (c) 2014-2016, Pedro Ramalhete, Andreia Correia
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of Concurrency Freaks nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************************************************************
 */
#ifndef _RISTATIC_H_
#define _RISTATIC_H_

#include <atomic>
#include <cstdint>
#include <iostream>

// TODO: change all calls that need the tid to use a function argument
// TODO: use std::vector instead of arrays for the retired objects (keep the padding)
class RIStaticPerThread {
private:
    const int maxThreads;
    alignas(128) std::atomic<uint64_t>* states;
    static const uint64_t NOT_READING = 0;
    static const uint64_t READING = 1;
    static const int CLPAD = 128/sizeof(uint64_t);

public:
    RIStaticPerThread(int maxThreads) : maxThreads{maxThreads} {
        states = new std::atomic<uint64_t>[maxThreads*CLPAD];
        for (int tid = 0; tid < maxThreads; tid++) {
            states[tid*CLPAD].store(NOT_READING, std::memory_order_relaxed);
        }
    }

    ~RIStaticPerThread() {
        delete[] states;
    }

    // Will attempt to pass all current READING states to READING+1
    inline void abortRollback() noexcept {
        for (int tid = 0; tid < maxThreads; tid++) {
            if (states[tid*CLPAD].load() != READING) continue;
            uint64_t read = READING;
            states[tid*CLPAD].compare_exchange_strong(read, READING+1);
        }
    }

    // Returns true if the arrival was successfully rolled back.
    // If there was a writer changing the state to READING+1 then it will
    // return false, meaning that the arrive() is still valid and visible.
    inline bool rollbackArrive(const int tid) noexcept {
        return (states[tid*CLPAD].fetch_add(-1) == READING);
    }

    inline void arrive(const int tid) noexcept {
        states[tid*CLPAD].store(READING);
    }

    inline void depart(const int tid) noexcept {
        states[tid*CLPAD].store(NOT_READING);  // Making this "memory_order_release" will cause overflows!
    }

    inline bool isEmpty() noexcept {
        for (int tid = 0; tid < maxThreads; tid++) {
            if (states[tid*CLPAD].load() != NOT_READING) return false;
        }
        return true;
    }
};

#endif /* RISTATIC_H */


================================================
FILE: common/ThreadRegistry.cpp
================================================
/*
 * Contains all global variables.
 */
#include "common/ThreadRegistry.hpp"

// Global/singleton to hold all the thread registry functionality
ThreadRegistry gThreadRegistry {};

// This is where every thread stores the tid it has been assigned when it calls getTID() for the first time.
// When the thread dies, the destructor of ThreadCheckInCheckOut will be called and de-register the thread.
thread_local ThreadCheckInCheckOut tl_tcico {};

void thread_registry_deregister_thread(const int tid) {
    gThreadRegistry.deregister_thread(tid);
}


================================================
FILE: common/ThreadRegistry.hpp
================================================
#ifndef _THREAD_REGISTRY_H_
#define _THREAD_REGISTRY_H_

#include <atomic>
#include <cassert>
#include <cstdint>
#include <iostream>

// Increase this if 128 threads is not enough
static const int REGISTRY_MAX_THREADS = 128;

extern void thread_registry_deregister_thread(const int tid);

// A helper class to do the check-in and check-out of the thread registry
struct ThreadCheckInCheckOut {
    static const int NOT_ASSIGNED = -1;
    int tid { NOT_ASSIGNED };
    ~ThreadCheckInCheckOut() {
        if (tid == NOT_ASSIGNED) return;
        thread_registry_deregister_thread(tid);
    }
};

extern thread_local ThreadCheckInCheckOut tl_tcico;

// Forward declaration of global/singleton instance
class ThreadRegistry;
extern ThreadRegistry gThreadRegistry;

/*
 *

 * <h1> Registry for threads </h1>

 *
 * This is a singleton-type class that allows assignment of a unique id to each thread.
 * The first time a thread calls ThreadRegistry::getTID() it will allocate a free slot in 'usedTID[]'.
 * This tid will be saved in a thread-local variable of the type ThreadCheckInCheckOut which,
 * upon destruction of the thread, will call the destructor of ThreadCheckInCheckOut and free the
 * corresponding slot to be used by a later thread.
 * RomulusLR relies on this to work properly.
 */
class ThreadRegistry {
private:
    alignas(128) std::atomic<bool> usedTID[REGISTRY_MAX_THREADS];  // Which TIDs are in use by threads
    alignas(128) std::atomic<int>  maxTid {-1};                    // Highest TID (+1) in use by threads

public:
    ThreadRegistry() {
        for (int it = 0; it < REGISTRY_MAX_THREADS; it++) {
            usedTID[it].store(false, std::memory_order_relaxed);
        }
    }

    /*
     * Progress Condition: wait-free bounded (by the number of threads)
     */
    int register_thread_new(void) {
        for (int tid = 0; tid < REGISTRY_MAX_THREADS; tid++) {
            if (usedTID[tid].load(std::memory_order_acquire)) continue;
            bool unused = false;
            if (!usedTID[tid].compare_exchange_strong(unused, true)) continue;
            // Increase the current maximum to cover our thread id
            int curMax = maxTid.load();
            while (curMax <= tid) {
                maxTid.compare_exchange_strong(curMax, tid+1);
                curMax = maxTid.load();
            }
            tl_tcico.tid = tid;
            return tid;
        }
        std::cout << "ERROR: Too many threads, registry can only hold " << REGISTRY_MAX_THREADS << " threads\n";
        assert(false);
    }

    /*
     * Progress condition: wait-free population oblivious
     */
    inline void deregister_thread(const int tid) {
        usedTID[tid].store(false, std::memory_order_release);
    }

    /*
     * Progress condition: wait-free population oblivious
     */
    static inline uint64_t getMaxThreads(void) {
        return gThreadRegistry.maxTid.load(std::memory_order_acquire);
    }

    /*
     * Progress condition: wait-free bounded (by the number of threads)
     */
    static inline int getTID(void) {
        int tid = tl_tcico.tid;
        if (tid != ThreadCheckInCheckOut::NOT_ASSIGNED) return tid;
        return gThreadRegistry.register_thread_new();
    }
};

#endif /* _THREAD_REGISTRY_H_ */


================================================
FILE: common/pfences.h
================================================
/*
 * Copyright 2017-2018
 *   Andreia Correia
 *   Pedro Ramalhete
 *   Pascal Felber
 *   Nachshon Cohen
 *
 * This work is published under the MIT license. See LICENSE.txt
 */
#ifndef _PERSISTENT_FENCES_
#define _PERSISTENT_FENCES_

/*
 * The naming for these macros and respective operations was taken from the excellent
 * "Preserving Happens-before in Persistent Memory" by Izraelevitz, Mendes, and Scott
 * https://www.cs.rochester.edu/u/jhi1/papers/2016-spaa-transform
 *
 * We have five different definitions of pwb/pfence/psync:
 * - Emulated: We introduce a delay on stores, like Mnemosyne does
 * - Nothing: only works with process restart persistency, i.e. process failures, but not system failure
 * - Define pwb as clflush (Broadwell cpus)
 * - Define pwb as clflushopt (most x86 cpus)
 * - Define pwb as clwb (only very recent cpus have this instruction)
 */

/*
 * We copied the methods from Mnemosyne:
 * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.221.5462&rep=rep1&type=pdf
 */
static inline unsigned long long asm_rdtsc(void) {
    unsigned hi, lo;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ((unsigned long long)lo) | (((unsigned long long)hi) << 32);
}

// Change this depending on the clock cycle of your cpu. For Cervino it's 2100, for my laptop it's 2712.
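// Worked example of the macros below: with EMULATED_CPUFREQ = 2100 (a 2.1 GHz clock),
// NS2CYCLE(340) evaluates to 340*2100/1000 = 714 cycles, so emulate_latency_ns(340)
// busy-waits until rdtsc() has advanced by roughly 714 ticks.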
#define EMULATED_CPUFREQ  2100

#define NS2CYCLE(__ns) ((__ns) * EMULATED_CPUFREQ / 1000)

static inline void emulate_latency_ns(int ns) {
    uint64_t stop;
    uint64_t start = asm_rdtsc();
    uint64_t cycles = NS2CYCLE(ns);
    do {
        /* RDTSC doesn't necessarily wait for previous instructions to complete
         * so a serializing instruction is usually used to ensure previous
         * instructions have completed. However, in our case this is a desirable
         * property since we want to overlap the latency we emulate with the
         * actual latency of the emulated instruction.
         */
        stop = asm_rdtsc();
    } while (stop - start < cycles);
}

/*
 * We use the settings on the delays for emulation from the NVMOVE paper:
 * http://www.cs.utexas.edu/~vijay/papers/inflow16-nvmove.pdf
 */
#ifdef PWB_IS_STT
/* Delays for emulating STT in DRAM */
#define PWB(addr)  emulate_latency_ns(140)
#define PFENCE()   emulate_latency_ns(200)
#define PSYNC()    emulate_latency_ns(200)
#elif PWB_IS_PCM
/* Delays for emulating PCM in DRAM */
#define PWB(addr)  emulate_latency_ns(340)
#define PFENCE()   emulate_latency_ns(500)
#define PSYNC()    emulate_latency_ns(500)
#elif PWB_IS_CLFLUSH
/*
 * More info at http://elixir.free-electrons.com/linux/latest/source/arch/x86/include/asm/special_insns.h#L213
 * Intel programming manual at https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
 * Use these for Broadwell CPUs (cervino server)
 */
#define PWB(addr)  __asm__ volatile("clflush (%0)" :: "r" (addr) : "memory")  // Broadwell only works with this.
#define PFENCE()   {}  // No ordering fences needed for CLFLUSH (section 7.4.6 of Intel manual)
#define PSYNC()    {}  // For durability it's not obvious, but CLFLUSH seems to be enough, and PMDK uses the same approach
#elif PWB_IS_CLWB
/* Use this for CPUs that support clwb, such as the SkyLake SP series (c5 compute intensive instances in AWS are an example of it) */
#define PWB(addr)  __asm__ volatile(".byte 0x66; xsaveopt %0" : "+m" (*(volatile char *)(addr)))  // clwb() only for Ice Lake onwards
#define PFENCE()   __asm__ volatile("sfence" : : : "memory")
#define PSYNC()    __asm__ volatile("sfence" : : : "memory")
#elif PWB_IS_NOP
/* pwbs are not needed for shared memory persistency (i.e. persistency across process failure) */
#define PWB(addr)  {}
#define PFENCE()   __asm__ volatile("sfence" : : : "memory")
#define PSYNC()    __asm__ volatile("sfence" : : : "memory")
#elif PWB_IS_CLFLUSHOPT
/* Use this for CPUs that support clflushopt, which is most recent x86 */
#define PWB(addr)  __asm__ volatile(".byte 0x66; clflush %0" : "+m" (*(volatile char *)(addr)))  // clflushopt (Kaby Lake)
#define PFENCE()   __asm__ volatile("sfence" : : : "memory")
#define PSYNC()    __asm__ volatile("sfence" : : : "memory")
#else
#error "You must define what PWB is. Choose PWB_IS_CLFLUSHOPT if you don't know what your CPU is capable of"
#endif

// Flush each cache line in a range
// TODO: fix cache alignment
inline static void flushFromTo(void* from, void* to) noexcept {
    const int cache_line_size = 64;
    uint8_t* ptr = (uint8_t*)from;
    for (; ptr < (uint8_t*)to; ptr += cache_line_size) PWB(ptr);
}

// TODO: Implement fences for ARM

#endif


================================================
FILE: datastructures/generic/TMHashMap.hpp
================================================
/******************************************************************************
 * Copyright (c) 2014-2018, Pedro Ramalhete, Andreia Correia
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of Concurrency Freaks nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************************************************************
 */
#ifndef _TM_NON_RESIZABLE_HASH_MAP_H_
#define _TM_NON_RESIZABLE_HASH_MAP_H_

#include <functional>
#include <stdexcept>
#include <string>

#include "../../stms/tm.h"   // This header defines the macros for the STM being compiled

/**
 *
* <h1> A Non-Resizable Hash Map for usage with STMs </h1>
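* <p>
* Illustrative usage sketch (not part of the original sources): keys and values
* are passed as pointers that must outlive the map, and 'tid' identifies the
* calling thread. How the TM_* macros expand depends on the STM selected in
* stms/tm.h.
* <pre>
*   TMHashMap<uint64_t,uint64_t> hm {};
*   uint64_t k = 33;
*   const int tid = 0;
*   hm.add(&k, tid);                    // runs put(&k,&k,tid) inside a TM_WRITE_TRANSACTION
*   bool found = hm.contains(&k, tid);  // runs get() inside a TM_READ_TRANSACTION
*   hm.remove(&k, tid);                 // runs removeKey() inside a TM_WRITE_TRANSACTION
* </pre>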
* * Each node contains 4 entries (key/value) so as to provide better cache locality * * * TODO * * @author Pedro Ramalhete * @author Andreia Correia */ template class TMHashMap : public TM_BASE_TYPE { private: // One KeyVal is 16+16 bytes, therefore, 4 KeyVals are 2 cache lines in x86 (128 bytes) static const int KV_NUM = 4; static const unsigned int MAX_THREADS = 128; const unsigned int maxThreads; const unsigned int capacity; struct KeyVal { //uint64_t h; // Full hash of the key, for faster comparison. TODO: add code to handle h TM_TYPE key {nullptr}; TM_TYPE val {nullptr}; KeyVal() {} KeyVal(K* key, V* value) : key{key}, val{value} { } }; struct Node : TM_BASE_TYPE { KeyVal kv[KV_NUM]; TM_TYPE next {nullptr}; Node() {} Node(K* key, V* value) { kv[0].key = key; kv[0].val = value; } bool isEmpty() { for (int i = 0; i < KV_NUM; i++) { if (kv[i].key != nullptr) return false; } return true; } }; alignas(128) Node* buckets; // An array of Nodes int myhash(K* key) { return 0; } // Used only for tests public: TMHashMap(unsigned int maxThreads=MAX_THREADS, unsigned int capacity=2*1024*1024) : maxThreads{maxThreads}, capacity{capacity} { buckets = new Node[capacity]; } ~TMHashMap() { delete[] buckets; } std::string className() { return TM_NAME() + "-HashMap"; } /* * Progress Condition: lock-free * Adds a node with a key if the key is not present, otherwise replaces the value. * Returns the previous value (nullptr by default). */ V* put(K* key, V* value, const int tid) { if (key == nullptr) throw std::invalid_argument("key can not be nullptr"); if (value == nullptr) throw std::invalid_argument("value can not be nullptr"); V* oldVal = nullptr; KeyVal *firstFree = nullptr; auto h = std::hash{}(*key); Node* node = &buckets[h]; while (true) { for (int i = 0; i < KV_NUM; i++) { KeyVal& kv = node->kv[i]; if (kv.key == nullptr) { // Save the first available entry, in case we need to insert somewhere if (firstFree == nullptr) firstFree = &kv; continue; } if (*kv.key != *key) continue; // Found a matching key, replace the old value with the new oldVal = kv.val; kv.val = value; return oldVal; } Node* lnext = node->next; if (lnext == nullptr) break; node = lnext; } // We got here without a replacement, so insert in the first available if (firstFree == nullptr) { // No available entry, allocate a node and insert it there Node* newNode = TM_ALLOC(key,value); node->next = newNode; } else { firstFree->key = key; firstFree->val = value; } return oldVal; } /* * Progress Condition: lock-free * Removes a key, returning the value associated with it. * Returns nullptr if there is no matching key. 
*/ V* removeKey(K* key, const int tid) { if (key == nullptr) throw std::invalid_argument("key can not be nullptr"); auto h = std::hash{}(*key); Node* node = &buckets[h]; Node* prev = node; while (true) { for (int i = 0; i < KV_NUM; i++) { KeyVal& kv = node->kv[i]; if (kv.key == nullptr || *kv.key != *key) continue; // Found a matching key, replace the old value with nullptr V* oldVal = kv.val; kv.val = nullptr; kv.key = nullptr; // Check if it's the first node and if it is empty, then unlink it and free it if (prev != node && node->isEmpty()) { prev->next = node->next; TM_FREE(node); } return oldVal; } prev = node; node = node->next; // We got to the end without a matching key, return nullptr if (node == nullptr) return nullptr; } } /* * Progress Condition: lock-free * Returns the value of associated with the key, nullptr if there is no mapping */ V* get(K* key, const int tid) { if (key == nullptr) throw std::invalid_argument("key can not be nullptr"); auto h = std::hash{}(*key); Node* node = &buckets[h]; while (true) { for (int i = 0; i < KV_NUM; i++) { KeyVal& kv = node->kv[i]; if (kv.key == nullptr || *kv.key != *key) continue; return kv.val; } Node* lnext = node->next; if (lnext == nullptr) return nullptr; node = lnext; } } // // Set methods for running the usual tests and benchmarks // bool add(K* key, const int tid) { return TM_WRITE_TRANSACTION([&] () -> bool { return put(key,key, tid) == nullptr; }); } bool remove(K* key, const int tid) { return TM_WRITE_TRANSACTION([this,key,tid] () -> bool { return removeKey(key, tid) != nullptr; }); } bool contains(K* key, const int tid) { return TM_READ_TRANSACTION([this,key,tid] () -> bool { return get(key, tid) != nullptr; }); } // Used only for benchmarks. It's single-threaded bool addAll(K** keys, const int size, const int tid) { for (int i = 0; i < size; i++) add(keys[i], tid); } }; #endif /* _TM_NON_RESIZABLE_HASH_MAP_H_ */ ================================================ FILE: datastructures/generic/TMLinkedListQueue.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2017, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _TM_LINKED_LIST_QUEUE_H_ #define _TM_LINKED_LIST_QUEUE_H_ #include <stdexcept> #include <string> #include "../../stms/CRWWPSTM.hpp" #include "../../stms/LeftRightTM.hpp" #include "../../stms/tm.h" // This header defines the macros for the STM being compiled #include "MWCLF.hpp" #include "MWCWF.hpp" #include "CXTM.hpp" /** *
* <h1> A Linked List queue using STM </h1>
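* <p>
* Illustrative usage sketch (not part of the original sources): items are passed
* by pointer, and a sentinel node at the head means dequeue() on an empty queue
* simply returns nullptr.
* <pre>
*   TMLinkedListQueue<int> q {};
*   int x = 7;
*   const int tid = 0;
*   q.enqueue(&x, tid);         // one TM_WRITE_TRANSACTION: link a new node after tail
*   int* a = q.dequeue(tid);    // a == &x
*   int* b = q.dequeue(tid);    // b == nullptr: only the sentinel remains
* </pre>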
* * * TODO * * * enqueue algorithm: sequential implementation + MWC * dequeue algorithm: sequential implementation + MWC * Consistency: Linearizable * enqueue() progress: lock-free * dequeue() progress: lock-free * Memory Reclamation: Hazard Eras (integrated into MWC) * enqueue min ops: 2 DCAS + 1 CAS * dequeue min ops: 1 DCAS + 1 CAS * * @author Pedro Ramalhete * @author Andreia Correia */ template class TMLinkedListQueue { private: static const unsigned int MAX_THREADS = 128; const unsigned int maxThreads; struct Node : TM_BASE_TYPE { T* item; TM_TYPE next; Node(T* userItem) : item{userItem}, next{nullptr} { } }; alignas(128) TM_TYPE head {nullptr}; alignas(128) TM_TYPE tail {nullptr}; public: TMLinkedListQueue(unsigned int maxThreads=MAX_THREADS) : maxThreads{maxThreads} { Node* sentinelNode = TM_ALLOC(nullptr); head = sentinelNode; tail = sentinelNode; } ~TMLinkedListQueue() { // TODO: replace this 0 with the actual tid otherwise we could have issues while (dequeue(0) != nullptr); // Drain the queue Node* lhead = head; delete lhead; } static std::string className() { return TM_NAME() + "-LinkedListQueue"; } /* * * Always returns true */ bool enqueue(T* item, const int tid) { if (item == nullptr) throw std::invalid_argument("item can not be nullptr"); return TM_WRITE_TRANSACTION([this,item] () -> bool { Node* newNode = TM_ALLOC(item); tail->next = newNode; tail = newNode; return true; }); } /* * */ T* dequeue(const int tid) { return TM_WRITE_TRANSACTION([this] () -> T* { Node* lhead = head; if (lhead == tail) return nullptr; head = lhead->next; TM_FREE(lhead); return head->item; }); } }; #endif /* _MWC_LINKED_LIST_QUEUE_H_ */ ================================================ FILE: datastructures/generic/TMLinkedListSet.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2018, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
****************************************************************************** */ #ifndef _TM_LINKED_LIST_SET_H_ #define _TM_LINKED_LIST_SET_H_ #include <stdexcept> #include <string> #include "../../stms/tm.h" // This header defines the macros for the STM being compiled /** *
* <h1> A Linked List Set for usage with STMs </h1>
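* <p>
* Illustrative usage sketch (not part of the original sources): the list is kept
* ordered between head/tail sentinels (largest keys first, judging by the
* traversal's early-exit test), so each operation is a single sorted scan inside
* one transaction.
* <pre>
*   TMLinkedListSet<uint64_t> set {};
*   uint64_t k = 5;
*   const int tid = 0;
*   set.add(&k, tid);           // true on the first insertion
*   set.add(&k, tid);           // false: key already present, the new node is TM_FREE'd
*   set.contains(&k, tid);      // true, runs as a TM_READ_TRANSACTION
*   set.remove(&k, tid);        // true, node reclaimed with TM_FREE
* </pre>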
* * TODO * * * @author Pedro Ramalhete * @author Andreia Correia */ template class TMLinkedListSet : public TM_BASE_TYPE { private: static const unsigned int MAX_THREADS = 128; const unsigned int maxThreads; struct Node : public TM_BASE_TYPE { T* key; TM_TYPE next; Node(T* key) : key{key}, next{nullptr} { } }; alignas(128) TM_TYPE head {nullptr}; alignas(128) TM_TYPE tail {nullptr}; public: TMLinkedListSet(unsigned int maxThreads=MAX_THREADS) : maxThreads{maxThreads} { Node* lhead = new Node(nullptr); Node* ltail = new Node(nullptr); head = lhead; head->next = ltail; tail = ltail; } ~TMLinkedListSet() { // Delete all the nodes in the list Node* prev = head; Node* node = prev->next; while (node != tail) { delete prev; prev = node; node = node->next; } delete prev; delete tail; } static std::string className() { return TM_NAME() + "-LinkedListSet"; } #ifdef TINY_STM /* * Progress Condition: lock-free * Adds a node with a key, returns false if the key is already in the set */ bool add(T* key, const int tid) { if (key == nullptr) throw std::invalid_argument("key can not be nullptr"); bool retval = false; WRITE_TX_BEGIN Node* newNode = TM_ALLOC(key); Node* prev = head; Node* node = prev->next; while (true) { if (node == tail) { prev->next = newNode; newNode->next = node; retval = true; break; } if (*key == *node->key) { TM_FREE(newNode); // If the key was already in the set, free the node that was never used break; } if (*(node->key) < *key) { prev->next = newNode; newNode->next = node; retval = true; break; } prev = node; node = node->next; } WRITE_TX_END return retval; } /* * Progress Condition: lock-free * Removes a node with an key, returns false if the key is not in the set */ bool remove(T* key, const int tid) { if (key == nullptr) throw std::invalid_argument("key can not be nullptr"); bool retval = false; WRITE_TX_BEGIN Node* prev = head; Node* node = prev->next; while (true) { if (node == tail) break; if (*key == *node->key) { prev->next = node->next; TM_FREE(node); retval = true; break; } if (*(node->key) < *key) break; prev = node; node = node->next; } WRITE_TX_END return retval; } /* * Progress Condition: lock-free * Returns true if it finds a node with a matching key */ bool contains(T* key, const int tid) { if (key == nullptr) throw std::invalid_argument("key can not be nullptr"); bool retval = false; READ_TX_BEGIN Node* node = head->next; while (true) { if (node == tail) break; if (*key == *node->key) {retval = true; break; } if (*(node->key) < *key) break; node = node->next; } READ_TX_END return retval; } #else /* * Progress Condition: lock-free * Adds a node with a key, returns false if the key is already in the set */ bool add(T* key, const int tid) { if (key == nullptr) throw std::invalid_argument("key can not be nullptr"); return TM_WRITE_TRANSACTION([this,key] () -> bool { Node* newNode = TM_ALLOC(key); Node* prev = head; Node* node = prev->next; while (true) { if (node == tail) break; if (*key == *node->key) { TM_FREE(newNode); // If the key was already in the set, free the node that was never used return false; } if (*(node->key) < *key) break; prev = node; node = node->next; } prev->next = newNode; newNode->next = node; return true; }); } /* * Progress Condition: lock-free * Removes a node with an key, returns false if the key is not in the set */ bool remove(T* key, const int tid) { if (key == nullptr) throw std::invalid_argument("key can not be nullptr"); return TM_WRITE_TRANSACTION([this,key] () -> bool { Node* prev = head; Node* node = prev->next; while (true) { if 
(node == tail) return false; if (*key == *node->key) { prev->next = node->next; TM_FREE(node); return true; } if (*(node->key) < *key) return false; prev = node; node = node->next; } }); } /* * Progress Condition: lock-free * Returns true if it finds a node with a matching key */ bool contains(T* key, const int tid) { if (key == nullptr) throw std::invalid_argument("key can not be nullptr"); return TM_READ_TRANSACTION([this,key] () -> bool { Node* node = head->next; while (true) { if (node == tail) return false; if (*key == *node->key) return true; if (*(node->key) < *key) return false; node = node->next; } }); } #endif bool addAll(T** keys, int size, const int tid) { for (int i = 0; i < size; i++) add(keys[i], tid); } }; #endif /* _TM_LINKED_LIST_SET_H_ */ ================================================ FILE: datastructures/generic/TMRedBlackBST.hpp ================================================ #ifndef _TM_RED_BLACK_BST_H_ #define _TM_RED_BLACK_BST_H_ #include #include #include #include "../../stms/tm.h" // This header defines the macros for the STM being compiled static const int64_t RED = 0; static const int64_t BLACK = 1; //http://algs4.cs.princeton.edu/code/edu/princeton/cs/algs4/RedBlackBST.java template class TMRedBlackBST : public TM_BASE_TYPE { struct Node : TM_BASE_TYPE { TM_TYPE key; TM_TYPE val; TM_TYPE left {nullptr}; TM_TYPE right {nullptr}; TM_TYPE color; // color of parent link TM_TYPE size; // subtree count Node(K* key, V* val, int64_t color, int64_t size) : key{key}, val{val}, color{color}, size{size} {} }; TM_TYPE root {nullptr}; // root of the BST const unsigned int maxThreads; inline void assignAndFreeIfNull(TM_TYPE& z, Node* w) { Node* tofree = z; z = w; if (w == nullptr) TM_FREE(tofree); } public: /** * Initializes an empty symbol table. */ TMRedBlackBST(unsigned int maxThreads=128) : maxThreads{maxThreads} { } /*************************************************************************** * Node helper methods. ***************************************************************************/ // is node x red; false if x is null ? bool isRed(Node* x) { if (x == nullptr) return false; return x->color == RED; } // number of node in subtree rooted at x; 0 if x is null int size(Node* x) { if (x == nullptr) return 0; return x->size; } /** * Returns the number of key-value pairs in this symbol table. * @return the number of key-value pairs in this symbol table */ int size() { return size(root); } /** * Is this symbol table empty? * @return {@code true} if this symbol table is empty and {@code false} otherwise */ bool isEmpty() { return root == nullptr; } /*************************************************************************** * Standard BST search-> ***************************************************************************/ /** * Returns the value associated with the given key. * @param key the key * @return the value associated with the given key if the key is in the symbol table * and {@code null} if the key is not in the symbol table * @throws IllegalArgumentException if {@code key} is {@code null} */ V* get(K* key) { if (key == nullptr) throw std::invalid_argument("item can not be nullptr"); return get(root, key); } // value associated with the given key in subtree rooted at x; null if no such key V* get(Node* x, K* key) { while (x != nullptr) { if (*key < *x->key) x = x->left; else if (*x->key < *key) x = x->right; else return x->val; } return nullptr; } /** * Does this symbol table contain the given key? 
* @param key the key * @return {@code true} if this symbol table contains {@code key} and * {@code false} otherwise * @throws IllegalArgumentException if {@code key} is {@code null} */ bool contains(K* key) { return get(key) != nullptr; } /*************************************************************************** * Red-black tree insertion. ***************************************************************************/ /** * Inserts the specified key-value pair into the symbol table, overwriting the old * value with the new value if the symbol table already contains the specified key. * Deletes the specified key (and its associated value) from this symbol table * if the specified value is {@code null}. * * @param key the key * @param val the value * @throws IllegalArgumentException if {@code key} is {@code null} */ void put(K* key, V* val) { if (key == nullptr) throw std::invalid_argument("item can not be nullptr"); if (val == nullptr) { deleteKey(key); return; } root = put(root, key, val); root->color = BLACK; // assert check(); } // insert the key-value pair in the subtree rooted at h Node* put(Node* h, K* key, V* val) { if (h == nullptr) return TM_ALLOC(key, val, RED, 1); if (*key < *h->key) h->left = put(h->left, key, val); else if (*h->key < *key) h->right = put(h->right, key, val); else h->val = val; // fix-up any right-leaning links if (isRed(h->right) && !isRed(h->left)) h = rotateLeft(h); if (isRed(h->left) && isRed(h->left->left)) h = rotateRight(h); if (isRed(h->left) && isRed(h->right)) flipColors(h); h->size = size(h->left) + size(h->right) + 1; return h; } /*************************************************************************** * Red-black tree deletion. ***************************************************************************/ /** * Removes the smallest key and associated value from the symbol table. * @throws NoSuchElementException if the symbol table is empty */ void deleteMin() { if (isEmpty()) throw std::invalid_argument("item can not be nullptr"); // if both children of root are black, set root to red if (!isRed(root->left) && !isRed(root->right)) root->color = RED; assignAndFreeIfNull(root, deleteMin(root)); if (!isEmpty()) root->color = BLACK; // assert check(); } // delete the key-value pair with the minimum key rooted at h Node* deleteMin(Node* h) { if (h->left == nullptr) return nullptr; if (!isRed(h->left) && !isRed(h->left->left)) h = moveRedLeft(h); assignAndFreeIfNull(h->left, deleteMin(h->left)); return balance(h); } /** * Removes the largest key and associated value from the symbol table. * @throws NoSuchElementException if the symbol table is empty */ void deleteMax() { if (isEmpty()) throw std::invalid_argument("item can not be nullptr"); // if both children of root are black, set root to red if (!isRed(root->left) && !isRed(root->right)) root->color = RED; root = deleteMax(root); if (!isEmpty()) root->color = BLACK; // assert check(); } // delete the key-value pair with the maximum key rooted at h Node* deleteMax(Node* h) { if (isRed(h->left)) h = rotateRight(h); if (h->right == nullptr) return nullptr; if (!isRed(h->right) && !isRed(h->right->left)) h = moveRedRight(h); h->right = deleteMax(h->right); return balance(h); } /** * Removes the specified key and its associated value from this symbol table * (if the key is in this symbol table). 
* * @param key the key * @throws IllegalArgumentException if {@code key} is {@code null} */ void deleteKey(K* key) { if (key == nullptr) throw std::invalid_argument("item can not be nullptr"); if (!contains(key)) return; // if both children of root are black, set root to red if (!isRed(root->left) && !isRed(root->right)) root->color = RED; assignAndFreeIfNull(root, deleteKey(root, key)); if (!isEmpty()) root->color = BLACK; // assert check(); } // delete the key-value pair with the given key rooted at h Node* deleteKey(Node* h, K* key) { // assert get(h, key) != null; if (*key < *h->key) { if (!isRed(h->left) && !isRed(h->left->left)) { h = moveRedLeft(h); } assignAndFreeIfNull(h->left, deleteKey(h->left, key)); } else { if (isRed(h->left)) { h = rotateRight(h); } if (*key == *h->key && (h->right == nullptr)) { return nullptr; } if (!isRed(h->right) && !isRed(h->right->left)) { h = moveRedRight(h); } if (*key == *h->key) { Node* x = min(h->right); h->key = x->key; h->val = x->val; // h->val = get(h->right, min(h->right).key); // h->key = min(h->right).key; assignAndFreeIfNull(h->right, deleteMin(h->right)); } else { assignAndFreeIfNull(h->right, deleteKey(h->right, key)); } } return balance(h); } /*************************************************************************** * Red-black tree helper functions. ***************************************************************************/ // make a left-leaning link lean to the right Node* rotateRight(Node* h) { // assert (h != null) && isRed(h->left); Node* x = h->left; h->left = x->right; x->right = h; x->color = x->right->color; x->right->color = RED; x->size = h->size; h->size = size(h->left) + size(h->right) + 1; return x; } // make a right-leaning link lean to the left Node* rotateLeft(Node* h) { // assert (h != null) && isRed(h->right); Node* x = h->right; h->right = x->left; x->left = h; x->color = x->left->color; x->left->color = RED; x->size = h->size; h->size = size(h->left) + size(h->right) + 1; return x; } // flip the colors of a node and its two children void flipColors(Node* h) { // h must have opposite color of its two children // assert (h != null) && (h->left != null) && (h->right != null); // assert (!isRed(h) && isRed(h->left) && isRed(h->right)) // || (isRed(h) && !isRed(h->left) && !isRed(h->right)); h->color = !h->color; h->left->color = !h->left->color; h->right->color = !h->right->color; } // Assuming that h is red and both h->left and h->left.left // are black, make h->left or one of its children red. Node* moveRedLeft(Node* h) { // assert (h != null); // assert isRed(h) && !isRed(h->left) && !isRed(h->left.left); flipColors(h); if (isRed(h->right->left)) { h->right = rotateRight(h->right); h = rotateLeft(h); flipColors(h); } return h; } // Assuming that h is red and both h->right and h->right.left // are black, make h->right or one of its children red. Node* moveRedRight(Node* h) { // assert (h != null); // assert isRed(h) && !isRed(h->right) && !isRed(h->right.left); flipColors(h); if (isRed(h->left->left)) { h = rotateRight(h); flipColors(h); } return h; } // restore red-black tree invariant Node* balance(Node* h) { // assert (h != null); if (isRed(h->right)) h = rotateLeft(h); if (isRed(h->left) && isRed(h->left->left)) h = rotateRight(h); if (isRed(h->left) && isRed(h->right)) flipColors(h); h->size = size(h->left) + size(h->right) + 1; return h; } /*************************************************************************** * Utility functions. 
***************************************************************************/ /** * Returns the height of the BST (for debugging). * @return the height of the BST (a 1-node tree has height 0) */ int height() { return height(root); } int height(Node* x) { if (x == nullptr) return -1; return 1 + std::max(height(x->left), height(x->right)); } /*************************************************************************** * Ordered symbol table methods. ***************************************************************************/ /** * Returns the smallest key in the symbol table. * @return the smallest key in the symbol table * @throws NoSuchElementException if the symbol table is empty */ K* min() { if (isEmpty()) throw std::invalid_argument("item can not be nullptr"); return min(root).key; } // the smallest key in subtree rooted at x; null if no such key Node* min(Node* x) { // assert x != null; if (x->left == nullptr) return x; else return min(x->left); } /** * Returns the largest key in the symbol table. * @return the largest key in the symbol table * @throws NoSuchElementException if the symbol table is empty */ K* max() { if (isEmpty()) throw std::invalid_argument("item can not be nullptr"); return max(root).key; } // the largest key in the subtree rooted at x; null if no such key Node* max(Node* x) { // assert x != null; if (x->right == nullptr) return x; else return max(x->right); } /** * Returns the largest key in the symbol table less than or equal to {@code key}. * @param key the key * @return the largest key in the symbol table less than or equal to {@code key} * @throws NoSuchElementException if there is no such key * @throws IllegalArgumentException if {@code key} is {@code null} */ K* floor(K* key) { if (key == nullptr) throw std::invalid_argument("item can not be nullptr"); if (isEmpty()) throw std::invalid_argument("item can not be nullptr"); Node* x = floor(root, key); if (x == nullptr) return nullptr; else return x->key; } // the largest key in the subtree rooted at x less than or equal to the given key Node* floor(Node* x, K* key) { if (x == nullptr) return nullptr; if (*key == *x->key) return x; if (*key < *x->key) return floor(x->left, key); Node* t = floor(x->right, key); if (t != nullptr) return t; else return x; } /** * Returns the smallest key in the symbol table greater than or equal to {@code key}. * @param key the key * @return the smallest key in the symbol table greater than or equal to {@code key} * @throws NoSuchElementException if there is no such key * @throws IllegalArgumentException if {@code key} is {@code null} */ K* ceiling(K* key) { if (key == nullptr) throw std::invalid_argument("item can not be nullptr"); if (isEmpty()) throw std::invalid_argument("item can not be nullptr"); Node* x = ceiling(root, key); if (x == nullptr) return nullptr; else return x->key; } // the smallest key in the subtree rooted at x greater than or equal to the given key Node* ceiling(Node* x, K* key) { if (x == nullptr) return nullptr; if (*key == *x->key) return x; if (*x->key < *key) return ceiling(x->right, key); Node* t = ceiling(x->left, key); if (t != nullptr) return t; else return x; } /** * Return the kth smallest key in the symbol table. 
* @param k the order statistic * @return the {@code k}th smallest key in the symbol table * @throws IllegalArgumentException unless {@code k} is between 0 and * n1 */ K* select(int k) { if (k < 0 || k >= size()) { throw std::invalid_argument("item can not be nullptr"); } Node x = select(root, k); return x->key; } // the key of rank k in the subtree rooted at x Node* select(Node* x, int k) { // assert x != null; // assert k >= 0 && k < size(x); int t = size(x->left); if (t > k) return select(x->left, k); else if (t < k) return select(x->right, k-t-1); else return x; } /** * Return the number of keys in the symbol table strictly less than {@code key}. * @param key the key * @return the number of keys in the symbol table strictly less than {@code key} * @throws IllegalArgumentException if {@code key} is {@code null} */ int rank(K* key) { if (key == nullptr) throw std::invalid_argument("item can not be nullptr"); return rank(key, root); } // number of keys less than key in the subtree rooted at x int rank(K* key, Node* x) { if (x == nullptr) return 0; if (*key < *x->key) return rank(key, x->left); else if (*x->key < *key) return 1 + size(x->left) + rank(key, x->right); else return size(x->left); } /*************************************************************************** * Range count and range search-> ***************************************************************************/ /** * Returns the number of keys in the symbol table in the given range. * * @param lo minimum endpoint * @param hi maximum endpoint * @return the number of keys in the sybol table between {@code lo} * (inclusive) and {@code hi} (inclusive) * @throws IllegalArgumentException if either {@code lo} or {@code hi} * is {@code null} */ int size(K* lo, K* hi) { if (lo == nullptr) throw std::invalid_argument("item can not be nullptr"); if (hi == nullptr) throw std::invalid_argument("item can not be nullptr"); if (*hi < *lo) return 0; if (contains(hi)) return rank(hi) - rank(lo) + 1; else return rank(hi) - rank(lo); } /*************************************************************************** * Check integrity of red-black tree data structure. ***************************************************************************/ bool check() { if (!isBST()) std::cout << "Not in symmetric order\n"; if (!isSizeConsistent()) std::cout << "Subtree counts not consistent\n"; //if (!isRankConsistent()) std::cout << "Ranks not consistent\n"; if (!is23()) std::cout << "Not a 2-3 tree\n"; if (!isBalanced()) std::cout << "Not balanced\n"; return isBST() && isSizeConsistent() && is23() && isBalanced(); } // does this binary tree satisfy symmetric order? // Note: this test also ensures that data structure is a binary tree since order is strict bool isBST() { return isBST(root, nullptr, nullptr); } // is the tree rooted at x a BST with all keys strictly between min and max // (if min or max is null, treat as empty constraint) // Credit: Bob Dondero's elegant solution bool isBST(Node* x, K* min, K* max) { if (x == nullptr) return true; // TODO: port these two lines //if (min != nullptr && x->key.compareTo(min) <= 0) return false; //if (max != nullptr && x->key.compareTo(max) >= 0) return false; return isBST(x->left, min, x->key) && isBST(x->right, x->key, max); } // are the size fields correct? 
bool isSizeConsistent() { return isSizeConsistent(root); } bool isSizeConsistent(Node* x) { if (x == nullptr) return true; if (x->size != size(x->left) + size(x->right) + 1) return false; return isSizeConsistent(x->left) && isSizeConsistent(x->right); } /* // check that ranks are consistent bool isRankConsistent() { for (int i = 0; i < size(); i++) if (i != rank(select(i))) return false; for (K* key : keys()) if (key.compareTo(select(rank(key))) != 0) return false; return true; } */ // Does the tree have no red right links, and at most one (left) // red links in a row on any path? bool is23() { return is23(root); } bool is23(Node* x) { if (x == nullptr) return true; if (isRed(x->right)) return false; if (x != root && isRed(x) && isRed(x->left)) return false; return is23(x->left) && is23(x->right); } // do all paths from root to leaf have same number of black edges? bool isBalanced() { int black = 0; // number of black links on path from root to min Node x = root; while (x != nullptr) { if (!isRed(x)) black++; x = x->left; } return isBalanced(root, black); } // does every path from the root to a leaf have the given number of black links? bool isBalanced(Node* x, int black) { if (x == nullptr) return black == 0; if (!isRed(x)) black--; return isBalanced(x->left, black) && isBalanced(x->right, black); } // Set methods bool add(K* key, const int tid) { return TM_WRITE_TRANSACTION([this,key] () -> bool { if (contains(key)) return false; put(key,key); return true; }); } bool remove(K* key, const int tid) { return TM_WRITE_TRANSACTION([this,key] () -> bool { if (!contains(key)) return false; deleteKey(key); return true; }); } inline bool contains(K* key, const int tid) { return TM_READ_TRANSACTION([this,key] () -> bool { return contains(key); }); } // This is not fully transactionally but it's ok because we use it only on initialization. // We could make it fully transactionally, but we would have to increase the size of allocation/store logs. bool addAll(K** keys, int size, const int tid) { for (int i = 0; i < size; i++) add(keys[i], tid); } std::string className() { return TM_NAME() + "-RedBlackBST"; } }; #endif // _TM_RED_BLACK_BST_H_ ================================================ FILE: datastructures/hashmaps/CRWWPSTMResizableHashSet.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _CRWWP_STM_RESIZABLE_HASH_MAP_H_ #define _CRWWP_STM_RESIZABLE_HASH_MAP_H_ #include #include "stms/CRWWPSTM.hpp" /** *
* <h1> A Resizable Hash Map for usage with STMs </h1>
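* <p>
* Illustrative usage sketch (not part of the original sources): the public
* methods already wrap the inner*() helpers in crwwpstm transactions, so callers
* never open a transaction themselves.
* <pre>
*   CRWWPSTMResizableHashSet<uint64_t> set {};
*   set.add(33);        // crwwpstm::updateTx([&] () { return innerPut(33); })
*   set.contains(33);   // crwwpstm::readTx: readers may run concurrently
*   set.remove(33);     // node reclaimed with crwwpstm::tmDelete()
* </pre>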
* TODO * */ template class CRWWPSTMResizableHashSet { private: struct Node : public crwwpstm::tmbase { crwwpstm::tmtype key; crwwpstm::tmtype next {nullptr}; Node(const K& k) : key{k} { } // Copy constructor for k }; crwwpstm::tmtype capacity; crwwpstm::tmtype sizeHM = 0; static constexpr double loadFactor = 0.75; crwwpstm::tmtype*> buckets; // An array of pointers to Nodes public: CRWWPSTMResizableHashSet(int maxThreads=0, int capacity=4) : capacity{capacity} { crwwpstm::updateTx([&] () { buckets = (crwwpstm::tmtype*)crwwpstm::tmMalloc(capacity*sizeof(crwwpstm::tmtype)); for (int i = 0; i < capacity; i++) buckets[i] = nullptr; }); } ~CRWWPSTMResizableHashSet() { crwwpstm::updateTx([&] () { for(int i = 0; i < capacity; i++){ Node* node = buckets[i]; while (node != nullptr) { Node* next = node->next; crwwpstm::tmDelete(node); node = next; } } crwwpstm::tmFree(buckets.load()); }); } static std::string className() { return crwwpstm::CRWWPSTM::className() + "-HashMap"; } void rebuild() { int newcapacity = 2*capacity; crwwpstm::tmtype* newbuckets = (crwwpstm::tmtype*)crwwpstm::tmMalloc(newcapacity*sizeof(crwwpstm::tmtype)); for (int i = 0; i < newcapacity; i++) newbuckets[i] = nullptr; for (int i = 0; i < capacity; i++) { Node* node = buckets[i]; while(node!=nullptr){ Node* next = node->next; auto h = std::hash{}(node->key) % newcapacity; node->next = newbuckets[h]; newbuckets[h] = node; node = next; } } crwwpstm::tmFree(buckets.load()); buckets = newbuckets; capacity = newcapacity; } /* * Adds a node with a key if the key is not present, otherwise replaces the value. * If saveOldValue is set, it will set 'oldValue' to the previous value, iff there was already a mapping. * * Returns true if there was no mapping for the key, false if there was already a value and it was replaced. */ bool innerPut(const K& key) { if (sizeHM > capacity*loadFactor) rebuild(); auto h = std::hash{}(key) % capacity; Node* node = buckets[h]; Node* prev = node; while (true) { if (node == nullptr) { Node* newnode = crwwpstm::tmNew(key); if (node == prev) { buckets[h] = newnode; } else { prev->next = newnode; } sizeHM++; return true; // New insertion } if (key == node->key) return false; prev = node; node = node->next; } } /* * Removes a key and its mapping. * Saves the value in 'oldvalue' if 'saveOldValue' is set. * * Returns returns true if a matching key was found */ bool innerRemove(const K& key) { auto h = std::hash{}(key) % capacity; Node* node = buckets[h]; Node* prev = node; while (true) { if (node == nullptr) return false; if (key == node->key) { if (node == prev) { buckets[h] = node->next; } else { prev->next = node->next; } sizeHM--; crwwpstm::tmDelete(node); return true; } prev = node; node = node->next; } } /* * Returns true if key is present. Saves a copy of 'value' in 'oldValue' if 'saveOldValue' is set. 
*/ bool innerGet(const K& key) { auto h = std::hash{}(key) % capacity; Node* node = buckets[h]; while (true) { if (node == nullptr) return false; if (key == node->key) return true; node = node->next; } } // // Set methods for running the usual tests and benchmarks // // Inserts a key only if it's not already present bool add(K key, const int tid=0) { return crwwpstm::updateTx([&] () { return innerPut(key); }); } // Returns true only if the key was present bool remove(K key, const int tid=0) { return crwwpstm::updateTx([&] () { return innerRemove(key); }); } bool contains(K key, const int tid=0) { return crwwpstm::readTx([&] () { return innerGet(key); }); } // Used only for benchmarks void addAll(K** keys, const int size, const int tid=0) { for (int i = 0; i < size; i++) add(*keys[i]); } }; #endif /* _CRWWP_STM_RESIZABLE_HASH_MAP_H_ */ ================================================ FILE: datastructures/hashmaps/ESTMResizableHashSet.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _ESTM_RESIZABLE_HASH_MAP_H_ #define _ESTM_RESIZABLE_HASH_MAP_H_ #include #include "stms/ESTM.hpp" /** *
* <h1> A Resizable Hash Map for usage with STMs </h1>
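* <p>
* A small worked example of the resize policy (shared by all these hash sets):
* with the default capacity of 4 and loadFactor 0.75 the threshold is 4*0.75 = 3,
* so once sizeHM reaches 4 the next innerPut() first calls rebuild(), which
* doubles capacity to 8 and rehashes every node into a new bucket array within
* the same transaction.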
* TODO * */ template class ESTMResizableHashSet { private: struct Node : public estm::tmbase { estm::tmtype key; estm::tmtype next {nullptr}; Node(const K& k) : key{k} { } // Copy constructor for k }; estm::tmtype capacity; estm::tmtype sizeHM = 0; static constexpr double loadFactor = 0.75; estm::tmtype*> buckets; // An array of pointers to Nodes public: ESTMResizableHashSet(int maxThreads=0, uint64_t capacity=4) : capacity{capacity} { estm::updateTx([&] () { buckets = (estm::tmtype*)estm::tmMalloc(capacity*sizeof(estm::tmtype)); for (int i = 0; i < capacity; i++) buckets[i] = nullptr; }); } ~ESTMResizableHashSet() { estm::updateTx([&] () { for(int i = 0; i < capacity; i++){ Node* node = buckets[i]; while (node != nullptr) { Node* next = node->next; estm::tmDelete(node); node = next; } } estm::tmFree(buckets.load()); }); } static std::string className() { return estm::ESTM::className() + "-HashMap"; } void rebuild() { uint64_t newcapacity = 2*capacity; estm::tmtype* newbuckets = (estm::tmtype*)estm::tmMalloc(newcapacity*sizeof(estm::tmtype)); for (int i = 0; i < newcapacity; i++) newbuckets[i] = nullptr; for (int i = 0; i < capacity; i++) { Node* node = buckets[i]; while (node!=nullptr) { Node* next = node->next; auto h = std::hash{}(node->key) % newcapacity; node->next = newbuckets[h]; newbuckets[h] = node; node = next; } } estm::tmFree(buckets); buckets = newbuckets; capacity = newcapacity; } /* * Adds a node with a key if the key is not present, otherwise replaces the value. * If saveOldValue is set, it will set 'oldValue' to the previous value, iff there was already a mapping. * * Returns true if there was no mapping for the key, false if there was already a value and it was replaced. */ bool innerPut(const K& key) { if (sizeHM.load() > capacity.load()*loadFactor) rebuild(); auto h = std::hash{}(key) % capacity; Node* node = buckets[h]; Node* prev = node; while (true) { if (node == nullptr) { Node* newnode = estm::tmNew(key); if (node == prev) { buckets[h] = newnode; } else { prev->next = newnode; } sizeHM++; return true; // New insertion } if (key == node->key) return false; prev = node; node = node->next; } } /* * Removes a key and its mapping. * Saves the value in 'oldvalue' if 'saveOldValue' is set. * * Returns returns true if a matching key was found */ bool innerRemove(const K& key) { auto h = std::hash{}(key) % capacity; Node* node = buckets[h]; Node* prev = node; while (true) { if (node == nullptr) return false; if (key == node->key) { if (node == prev) { buckets[h] = node->next; } else { prev->next = node->next; } sizeHM--; estm::tmDelete(node); return true; } prev = node; node = node->next; } } /* * Returns true if key is present. Saves a copy of 'value' in 'oldValue' if 'saveOldValue' is set. 
*/ bool innerGet(const K& key) { auto h = std::hash{}(key) % capacity; Node* node = buckets[h]; while (true) { if (node == nullptr) return false; if (key == node->key) return true; node = node->next; } } // // Set methods for running the usual tests and benchmarks // // Inserts a key only if it's not already present bool add(K key, const int tid=0) { return estm::updateTx([&] () { return innerPut(key); }); } // Returns true only if the key was present bool remove(K key, const int tid=0) { return estm::updateTx([&] () { return innerRemove(key); }); } bool contains(K key, const int tid=0) { return estm::readTx([&] () { return innerGet(key); }); } // Used only for benchmarks void addAll(K** keys, const int size, const int tid=0) { for (int i = 0; i < size; i++) add(*keys[i]); } }; #endif /* _ESTM_RESIZABLE_HASH_MAP_H_ */ ================================================ FILE: datastructures/hashmaps/OFLFResizableHashSet.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _OF_LF_RESIZABLE_HASH_MAP_H_ #define _OF_LF_RESIZABLE_HASH_MAP_H_ #include #include "stms/OneFileLF.hpp" /** *
* <h1> A Resizable Hash Map for usage with STMs </h1>
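* <p>
* Illustrative usage sketch (not part of the original sources): this is the
* OneFile lock-free variant, so tmtype fields are read with pload() inside the
* transaction, and the set methods below wrap everything in oflf::updateTx()
* or oflf::readTx().
* <pre>
*   OFLFResizableHashSet<uint64_t> set {};
*   set.add(33);        // lock-free update transaction
*   set.contains(33);   // lock-free read-only transaction
* </pre>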
* TODO * */ template class OFLFResizableHashSet { private: struct Node : public oflf::tmbase { oflf::tmtype key; oflf::tmtype next {nullptr}; Node(const K& k) : key{k} { } // Copy constructor for k }; oflf::tmtype capacity; oflf::tmtype sizeHM = 0; static constexpr double loadFactor = 0.75; oflf::tmtype*> buckets; // An array of pointers to Nodes public: OFLFResizableHashSet(int maxThreads=0, uint64_t capacity=4) : capacity{capacity} { oflf::updateTx([&] () { buckets = (oflf::tmtype*)oflf::tmMalloc(capacity*sizeof(oflf::tmtype)); for (int i = 0; i < capacity; i++) buckets[i] = nullptr; }); } ~OFLFResizableHashSet() { oflf::updateTx([&] () { for (int i = 0; i < capacity; i++){ Node* node = buckets[i]; while (node != nullptr) { Node* next = node->next; oflf::tmDelete(node); node = next; } } oflf::tmFree(buckets.pload()); }); } static std::string className() { return oflf::OneFileLF::className() + "-HashMap"; } void rebuild() { uint64_t newcapacity = 2*capacity; oflf::tmtype* newbuckets = (oflf::tmtype*)oflf::tmMalloc(newcapacity*sizeof(oflf::tmtype)); for (int i = 0; i < newcapacity; i++) newbuckets[i] = nullptr; for (int i = 0; i < capacity; i++) { Node* node = buckets[i]; while (node!=nullptr) { Node* next = node->next; auto h = std::hash{}(node->key) % newcapacity; node->next = newbuckets[h]; newbuckets[h] = node; node = next; } } oflf::tmFree(buckets.pload()); buckets = newbuckets; capacity = newcapacity; } /* * Adds a node with a key if the key is not present, otherwise replaces the value. * If saveOldValue is set, it will set 'oldValue' to the previous value, iff there was already a mapping. * * Returns true if there was no mapping for the key, false if there was already a value and it was replaced. */ bool innerPut(const K& key) { if (sizeHM.pload() > capacity.pload()*loadFactor) rebuild(); auto h = std::hash{}(key) % capacity; Node* node = buckets[h]; Node* prev = node; while (true) { if (node == nullptr) { Node* newnode = oflf::tmNew(key); if (node == prev) { buckets[h] = newnode; } else { prev->next = newnode; } sizeHM++; return true; // New insertion } if (key == node->key) return false; prev = node; node = node->next; } } /* * Removes a key and its mapping. * Saves the value in 'oldvalue' if 'saveOldValue' is set. * * Returns returns true if a matching key was found */ bool innerRemove(const K& key) { auto h = std::hash{}(key) % capacity; Node* node = buckets[h]; Node* prev = node; while (true) { if (node == nullptr) return false; if (key == node->key) { if (node == prev) { buckets[h] = node->next; } else { prev->next = node->next; } sizeHM--; oflf::tmDelete(node); return true; } prev = node; node = node->next; } } /* * Returns true if key is present. Saves a copy of 'value' in 'oldValue' if 'saveOldValue' is set. 
*/ bool innerGet(const K& key) { auto h = std::hash{}(key) % capacity; Node* node = buckets[h]; while (true) { if (node == nullptr) return false; if (key == node->key) return true; node = node->next; } } // // Set methods for running the usual tests and benchmarks // // Inserts a key only if it's not already present bool add(K key, const int tid=0) { return oflf::updateTx([&] () { return innerPut(key); }); } // Returns true only if the key was present bool remove(K key, const int tid=0) { return oflf::updateTx([&] () { return innerRemove(key); }); } bool contains(K key, const int tid=0) { return oflf::readTx([&] () { return innerGet(key); }); } // Used only for benchmarks void addAll(K** keys, const int size, const int tid=0) { for (int i = 0; i < size; i++) add(*keys[i]); } }; #endif /* _OF_LF_RESIZABLE_HASH_MAP_H_ */ ================================================ FILE: datastructures/hashmaps/OFWFResizableHashSet.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _OF_WF_RESIZABLE_HASH_MAP_H_ #define _OF_WF_RESIZABLE_HASH_MAP_H_ #include #include "stms/OneFileWF.hpp" /** *
* <h1> A Resizable Hash Map for usage with STMs </h1>
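* <p>
* Unlike the lock-free variant, with the wait-free OneFile STM a pending
* transaction may be completed by a helping thread, which is presumably why the
* lambdas below capture by value ([=]) rather than by reference. Usage is the
* same as for the other hash sets:
* <pre>
*   OFWFResizableHashSet<uint64_t> set {};
*   set.add(33);        // wait-free: may be finished by a helper thread
*   set.contains(33);
* </pre>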
* TODO * */ template class OFWFResizableHashSet { private: struct Node : public ofwf::tmbase { ofwf::tmtype key; ofwf::tmtype next {nullptr}; Node(const K& k) : key{k} { } // Copy constructor for k }; ofwf::tmtype capacity; ofwf::tmtype sizeHM = 0; static constexpr double loadFactor = 0.75; ofwf::tmtype*> buckets; // An array of pointers to Nodes public: OFWFResizableHashSet(int maxThreads=0, uint64_t capacity=4) : capacity{capacity} { ofwf::updateTx([&] () { buckets = (ofwf::tmtype*)ofwf::tmMalloc(capacity*sizeof(ofwf::tmtype)); for (int i = 0; i < capacity; i++) buckets[i] = nullptr; }); } ~OFWFResizableHashSet() { ofwf::updateTx([=] () { for (int i = 0; i < capacity; i++){ Node* node = buckets[i]; while (node != nullptr) { Node* next = node->next; ofwf::tmDelete(node); node = next; } } ofwf::tmFree(buckets.pload()); }); } static std::string className() { return ofwf::OneFileWF::className() + "-HashMap"; } void rebuild() { uint64_t newcapacity = 2*capacity; ofwf::tmtype* newbuckets = (ofwf::tmtype*)ofwf::tmMalloc(newcapacity*sizeof(ofwf::tmtype)); for (int i = 0; i < newcapacity; i++) newbuckets[i] = nullptr; for (int i = 0; i < capacity; i++) { Node* node = buckets[i]; while (node!=nullptr) { Node* next = node->next; auto h = std::hash{}(node->key) % newcapacity; node->next = newbuckets[h]; newbuckets[h] = node; node = next; } } ofwf::tmFree(buckets.pload()); buckets = newbuckets; capacity = newcapacity; } /* * Adds a node with a key if the key is not present, otherwise replaces the value. * If saveOldValue is set, it will set 'oldValue' to the previous value, iff there was already a mapping. * * Returns true if there was no mapping for the key, false if there was already a value and it was replaced. */ bool innerPut(const K& key) { if (sizeHM.pload() > capacity.pload()*loadFactor) rebuild(); auto h = std::hash{}(key) % capacity; Node* node = buckets[h]; Node* prev = node; while (true) { if (node == nullptr) { Node* newnode = ofwf::tmNew(key); if (node == prev) { buckets[h] = newnode; } else { prev->next = newnode; } sizeHM++; return true; // New insertion } if (key == node->key) return false; prev = node; node = node->next; } } /* * Removes a key and its mapping. * Saves the value in 'oldvalue' if 'saveOldValue' is set. * * Returns returns true if a matching key was found */ bool innerRemove(const K& key) { auto h = std::hash{}(key) % capacity; Node* node = buckets[h]; Node* prev = node; while (true) { if (node == nullptr) return false; if (key == node->key) { if (node == prev) { buckets[h] = node->next; } else { prev->next = node->next; } sizeHM--; ofwf::tmDelete(node); return true; } prev = node; node = node->next; } } /* * Returns true if key is present. Saves a copy of 'value' in 'oldValue' if 'saveOldValue' is set. 
*/ bool innerGet(const K& key) { auto h = std::hash{}(key) % capacity; Node* node = buckets[h]; while (true) { if (node == nullptr) return false; if (key == node->key) return true; node = node->next; } } // // Set methods for running the usual tests and benchmarks // // Inserts a key only if it's not already present bool add(K key, const int tid=0) { return ofwf::updateTx([=] () { return innerPut(key); }); } // Returns true only if the key was present bool remove(K key, const int tid=0) { return ofwf::updateTx([=] () { return innerRemove(key); }); } bool contains(K key, const int tid=0) { return ofwf::readTx([=] () { return innerGet(key); }); } // Used only for benchmarks void addAll(K** keys, const int size, const int tid=0) { for (int i = 0; i < size; i++) add(*keys[i]); } }; #endif /* _OF_WF_RESIZABLE_HASH_MAP_H_ */ ================================================ FILE: datastructures/hashmaps/TinySTMResizableHashSet.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _TINY_STM_RESIZABLE_HASH_MAP_H_ #define _TINY_STM_RESIZABLE_HASH_MAP_H_ #include #include "stms/TinySTM.hpp" /** *
* <h1> A Resizable Hash Map for usage with STMs </h1>
* TODO * */ template class TinySTMResizableHashSet { private: struct Node : public tinystm::tmbase { tinystm::tmtype key; tinystm::tmtype next {nullptr}; Node(const K& k) : key{k} { } // Copy constructor for k }; tinystm::tmtype capacity; tinystm::tmtype sizeHM = 0; static constexpr double loadFactor = 0.75; tinystm::tmtype*> buckets; // An array of pointers to Nodes public: TinySTMResizableHashSet(int maxThreads=0, uint64_t capacity=4) : capacity{capacity} { tinystm::updateTx([&] () { buckets = (tinystm::tmtype*)tinystm::tmMalloc(capacity*sizeof(tinystm::tmtype)); for (int i = 0; i < capacity; i++) buckets[i] = nullptr; return true; }); } ~TinySTMResizableHashSet() { tinystm::updateTx([&] () { for(int i = 0; i < capacity; i++){ Node* node = buckets[i]; while (node != nullptr) { Node* next = node->next; tinystm::tmDelete(node); node = next; } } tinystm::tmFree(buckets.load()); return true; }); } static std::string className() { return tinystm::TinySTM::className() + "-HashMap"; } void rebuild() { uint64_t newcapacity = 2*capacity; tinystm::tmtype* newbuckets = (tinystm::tmtype*)tinystm::tmMalloc(newcapacity*sizeof(tinystm::tmtype)); for (int i = 0; i < newcapacity; i++) newbuckets[i] = nullptr; for (int i = 0; i < capacity; i++) { Node* node = buckets[i]; while (node!=nullptr) { Node* next = node->next; auto h = std::hash{}(node->key) % newcapacity; node->next = newbuckets[h]; newbuckets[h] = node; node = next; } } tinystm::tmFree(buckets); buckets = newbuckets; capacity = newcapacity; } /* * Adds a node with a key if the key is not present, otherwise replaces the value. * If saveOldValue is set, it will set 'oldValue' to the previous value, iff there was already a mapping. * * Returns true if there was no mapping for the key, false if there was already a value and it was replaced. */ bool innerPut(const K& key) { if (sizeHM > capacity*loadFactor) rebuild(); auto h = std::hash{}(key) % capacity; Node* node = buckets[h]; Node* prev = node; while (true) { if (node == nullptr) { Node* newnode = tinystm::tmNew(key); if (node == prev) { buckets[h] = newnode; } else { prev->next = newnode; } sizeHM++; return true; // New insertion } if (key == node->key) return false; prev = node; node = node->next; } } /* * Removes a key and its mapping. * Saves the value in 'oldvalue' if 'saveOldValue' is set. * * Returns returns true if a matching key was found */ bool innerRemove(const K& key) { auto h = std::hash{}(key) % capacity; Node* node = buckets[h]; Node* prev = node; while (true) { if (node == nullptr) return false; if (key == node->key) { if (node == prev) { buckets[h] = node->next; } else { prev->next = node->next; } sizeHM--; tinystm::tmDelete(node); return true; } prev = node; node = node->next; } } /* * Returns true if key is present. Saves a copy of 'value' in 'oldValue' if 'saveOldValue' is set. 
*/ bool innerGet(const K& key) { auto h = std::hash{}(key) % capacity; Node* node = buckets[h]; while (true) { if (node == nullptr) return false; if (key == node->key) return true; node = node->next; } } // // Set methods for running the usual tests and benchmarks // // Inserts a key only if it's not already present bool add(K key, const int tid=0) { return tinystm::updateTx([&] () { return innerPut(key); }); } // Returns true only if the key was present bool remove(K key, const int tid=0) { return tinystm::updateTx([&] () { return innerRemove(key); }); } bool contains(K key, const int tid=0) { return tinystm::readTx([&] () { return innerGet(key); }); } // Used only for benchmarks void addAll(K** keys, const int size, const int tid=0) { for (int i = 0; i < size; i++) add(*keys[i]); } }; #endif /* _TINY_STM_RESIZABLE_HASH_MAP_H_ */ ================================================ FILE: datastructures/linkedlists/CRWWPLinkedListSet.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2018, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _CRWWP_STM_LINKED_LIST_SET_H_ #define _CRWWP_STM_LINKED_LIST_SET_H_ #include #include #include "stms/CRWWPSTM.hpp" /** *
* <h1> A Linked List Set for CRWWP STM (blocking) </h1>
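* <p>
* A short worked example of the ordering invariant (an illustrative sketch, not
* part of the original sources): add() inserts each key just before the first
* node with a smaller key, so the list is kept in descending order, and
* contains() can stop as soon as it reaches a node whose key is smaller than
* the one searched for.
* <pre>
*   CRWWPLinkedListSet<int> set {};
*   set.add(1); set.add(3); set.add(2);   // list: head -> 3 -> 2 -> 1 -> tail
*   set.contains(2);                      // true, found right after passing 3
*   set.contains(4);                      // false as soon as node 3 is reached (3 < 4)
* </pre>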
* * TODO * * * @author Pedro Ramalhete * @author Andreia Correia */ template class CRWWPLinkedListSet : public crwwpstm::tmbase { private: struct Node : public crwwpstm::tmbase { T key; crwwpstm::tmtype next {nullptr}; Node() {} Node(T key) : key{key} { } }; alignas(128) crwwpstm::tmtype head {nullptr}; alignas(128) crwwpstm::tmtype tail {nullptr}; public: CRWWPLinkedListSet(unsigned int maxThreads=0) { Node* lhead = new Node(); Node* ltail = new Node(); head = lhead; head->next = ltail; tail = ltail; } ~CRWWPLinkedListSet() { // Delete all the nodes in the list Node* prev = head; Node* node = prev->next; while (node != tail) { delete prev; prev = node; node = node->next; } delete prev; delete tail; } static std::string className() { return crwwpstm::CRWWPSTM::className() + "-LinkedListSet"; } /* * Progress Condition: blocking * Adds a node with a key, returns false if the key is already in the set */ bool add(T key, const int tid=0) { return crwwpstm::updateTx([&] () -> bool { Node* newNode = crwwpstm::tmNew(key); Node* prev = head; Node* node = prev->next; while (true) { if (node == tail) break; if (key == node->key) { crwwpstm::tmDelete(newNode); // If the key was already in the set, free the node that was never used return false; } if (node->key < key) break; prev = node; node = node->next; } prev->next = newNode; newNode->next = node; return true; }); } /* * Progress Condition: blocking * Removes a node with an key, returns false if the key is not in the set */ bool remove(T key, const int tid=0) { return crwwpstm::updateTx([&] () -> bool { Node* prev = head; Node* node = prev->next; while (true) { if (node == tail) return false; if (key == node->key) { prev->next = node->next; crwwpstm::tmDelete(node); return true; } if (node->key < key) return false; prev = node; node = node->next; } }); } /* * Progress Condition: blocking * Returns true if it finds a node with a matching key */ bool contains(T key, const int tid=0) { return crwwpstm::readTx([&] () -> bool { Node* node = head->next; while (true) { if (node == tail) return false; if (key == node->key) return true; if (node->key < key) return false; node = node->next; } }); } bool addAll(T** keys, int size, const int tid) { for (int i = 0; i < size; i++) add(*keys[i], tid); } }; #endif /* _C_RW_WP_STM_LINKED_LIST_SET_H_ */ ================================================ FILE: datastructures/linkedlists/ESTMLinkedListSet.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _ESTM_LINKED_LIST_SET_H_ #define _ESTM_LINKED_LIST_SET_H_ #include "../../stms/ESTM.hpp" // This header defines the macros for the STM being compiled /** *

* A Linked List Set for Elastic STM

* When we make the 'ltail' optimization here, it causes a crash on ESTM, therefore we don't do it. */ template class ESTMLinkedListSet : public estm::tmbase { private: struct Node : public estm::tmbase { T key {}; estm::tmtype next {nullptr}; Node() {} Node(T key) : key{key} { } }; alignas(128) estm::tmtype head {nullptr}; alignas(128) estm::tmtype tail {nullptr}; public: ESTMLinkedListSet(unsigned int maxThreads=0) { estm::updateTx([&] () { Node* lhead = estm::tmNew(); Node* ltail = estm::tmNew(); head = lhead; head->next = ltail; tail = ltail; }); } ~ESTMLinkedListSet() { estm::updateTx([&] () { // Delete all the nodes in the list Node* prev = head; Node* node = prev->next; while (node != tail) { estm::tmDelete(prev); prev = node; node = node->next; } estm::tmDelete(prev); estm::tmDelete(tail.load()); }); } static std::string className() { return estm::ESTM::className() + "-LinkedListSet"; } /* * Progress Condition: blocking * Adds a node with a key, returns false if the key is already in the set */ bool add(T key, const int tid=0) { return estm::updateTx([this,key] () { Node* newNode = estm::tmNew(key); Node* prev = head; Node* node = prev->next; while (true) { if (node == tail) break; if (key == node->key) { estm::tmDelete(newNode); // If the key was already in the set, free the node that was never used return false; } if (node->key < key) break; prev = node; node = node->next; } prev->next = newNode; newNode->next = node; return true; }); } /* * Progress Condition: blocking * Removes a node with an key, returns false if the key is not in the set */ bool remove(T key, const int tid=0) { return estm::updateTx([this,key] () { Node* prev = head; Node* node = prev->next; while (true) { if (node == tail) return false; if (key == node->key) { prev->next = node->next; estm::tmDelete(node); return true; } if (node->key < key) return false; prev = node; node = node->next; } }); } /* * Progress Condition: blocking * Returns true if it finds a node with a matching key */ bool contains(T key, const int tid=0) { return estm::readTx([this,key] () { Node* node = head->next; while (true) { if (node == tail) return false; if (key == node->key) return true; if (node->key < key) return false; node = node->next; } }); } bool addAll(T** keys, int size, const int tid) { for (int i = 0; i < size; i++) add(*keys[i], tid); return true; } }; #endif /* _ESTM_LINKED_LIST_SET_H_ */ ================================================ FILE: datastructures/linkedlists/HazardEras.hpp ================================================ /****************************************************************************** * Copyright (c) 2016-2017, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _HAZARD_ERAS_H_ #define _HAZARD_ERAS_H_ #include #include #include #include /* *

* Hazard Eras

* This is a light-weight implementation of Hazard Eras, where each thread has a
* thread-local list of retired objects.
*
* It is based on the paper "Hazard Eras - Non-Blocking Memory Reclamation"
* by Pedro Ramalhete and Andreia Correia:
* https://github.com/pramalhe/ConcurrencyFreaks/blob/master/papers/hazarderas-2017.pdf
*
* The type T is used for the objects/nodes and must have the members newEra and delEra.
*
* The reclamation threshold R is zero, i.e. every retire() scans for reclaimable objects.
*
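* A minimal usage sketch (illustrative only; 'Node', 'atomicNodePtr', 'oldNode' and
* 'tid' are assumptions, not part of this header):
*
*   HazardEras<Node> he {2, maxThreads};
*   Node* n = he.get_protected(0, atomicNodePtr, tid); // protect before dereferencing
*   // ... safely read n->key, n->next, ...
*   he.clear(tid);            // drop all era protections held by this thread
*   he.retire(oldNode, tid);  // deletion is deferred until no era protects oldNode
*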

* @author Pedro Ramalhete * @author Andreia Correia */ template class HazardEras { private: static const uint64_t NONE = 0; static const int HE_MAX_THREADS = 128; static const int MAX_HES = 5; // This is named 'K' in the HP paper static const int CLPAD = 128/sizeof(std::atomic); static const int HE_THRESHOLD_R = 0; // This is named 'R' in the HP paper const int maxHEs; const int maxThreads; alignas(128) std::atomic eraClock {1}; alignas(128) std::atomic* he[HE_MAX_THREADS]; // It's not nice that we have a lot of empty vectors, but we need padding to avoid false sharing alignas(128) std::vector retiredList[HE_MAX_THREADS*CLPAD]; public: HazardEras(int maxHEs=MAX_HES, int maxThreads=HE_MAX_THREADS) : maxHEs{maxHEs}, maxThreads{maxThreads} { for (int it = 0; it < HE_MAX_THREADS; it++) { he[it] = new std::atomic[CLPAD*2]; // We allocate four cache lines to allow for many hps and without false sharing retiredList[it*CLPAD].reserve(maxThreads*maxHEs); for (int ihe = 0; ihe < MAX_HES; ihe++) { he[it][ihe].store(NONE, std::memory_order_relaxed); } } static_assert(std::is_same::value, "T::newEra must be uint64_t"); static_assert(std::is_same::value, "T::delEra must be uint64_t"); } ~HazardEras() { for (int it = 0; it < HE_MAX_THREADS; it++) { delete[] he[it]; // Clear the current retired nodes for (unsigned iret = 0; iret < retiredList[it*CLPAD].size(); iret++) { delete retiredList[it*CLPAD][iret]; } } } inline uint64_t getEra() { return eraClock.load(); } /** * Progress Condition: wait-free bounded (by maxHEs) */ inline void clear(const int tid) { for (int ihe = 0; ihe < maxHEs; ihe++) { he[tid][ihe].store(NONE, std::memory_order_release); } } /** * Progress Condition: lock-free */ inline T* get_protected(int index, const std::atomic& atom, const int tid) { auto prevEra = he[tid][index].load(std::memory_order_relaxed); while (true) { T* ptr = atom.load(); auto era = eraClock.load(std::memory_order_acquire); if (era == prevEra) return ptr; he[tid][index].store(era); prevEra = era; } } inline void protectEraRelease(int index, int other, const int tid) { auto era = he[tid][other].load(std::memory_order_relaxed); if (he[tid][index].load(std::memory_order_relaxed) == era) return; he[tid][index].store(era, std::memory_order_release); } /* * Does a single iteration. Must be integrated into the algorithm that's using HE. * In other words, we must re-check if era has changed * * Progress Condition: wait-free population oblivious */ inline T* protectPtr(int index, const std::atomic& atom, uint64_t& prevEra, const int tid) { T* ptr = atom.load(std::memory_order_acquire); auto era = eraClock.load(); if (prevEra != era) { prevEra = era; he[tid][index].store(era, std::memory_order_relaxed); std::atomic_thread_fence(std::memory_order_seq_cst); } return ptr; } /** * Retire an object (node) * Progress Condition: wait-free bounded * * Doing rlist.erase() is not the most efficient way to remove entries from a std::vector, but ok... 
*/ void retire(T* ptr, const int mytid) { auto currEra = eraClock.load(); ptr->delEra = currEra; auto& rlist = retiredList[mytid*CLPAD]; rlist.push_back(ptr); if (eraClock == currEra) eraClock.fetch_add(1); for (unsigned iret = 0; iret < rlist.size();) { auto obj = rlist[iret]; if (canDelete(obj, mytid)) { rlist.erase(rlist.begin() + iret); delete obj; continue; } iret++; } } private: bool canDelete(T* obj, const int mytid) { for (int tid = 0; tid < maxThreads; tid++) { for (int ihe = 0; ihe < maxHEs; ihe++) { const auto era = he[tid][ihe].load(std::memory_order_acquire); if (era == NONE || era < obj->newEra || era > obj->delEra) continue; return false; } } return true; } }; #endif /* _HAZARD_ERAS_H_ */ ================================================ FILE: datastructures/linkedlists/HazardPointers.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2017, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
****************************************************************************** */ #ifndef _HAZARD_POINTERS_H_ #define _HAZARD_POINTERS_H_ #include #include #include template class HazardPointers { private: static const int HP_MAX_THREADS = 128; static const int HP_MAX_HPS = 5; // This is named 'K' in the HP paper static const int CLPAD = 128/sizeof(std::atomic); static const int HP_THRESHOLD_R = 0; // This is named 'R' in the HP paper static const int MAX_RETIRED = HP_MAX_THREADS*HP_MAX_HPS; // Maximum number of retired objects per thread const int maxHPs; const int maxThreads; alignas(128) std::atomic* hp[HP_MAX_THREADS]; // It's not nice that we have a lot of empty vectors, but we need padding to avoid false sharing alignas(128) std::vector retiredList[HP_MAX_THREADS*CLPAD]; public: HazardPointers(int maxHPs=HP_MAX_HPS, int maxThreads=HP_MAX_THREADS) : maxHPs{maxHPs}, maxThreads{maxThreads} { for (int it = 0; it < HP_MAX_THREADS; it++) { hp[it] = new std::atomic[CLPAD*2]; // We allocate four cache lines to allow for many hps and without false sharing retiredList[it*CLPAD].reserve(MAX_RETIRED); for (int ihp = 0; ihp < HP_MAX_HPS; ihp++) { hp[it][ihp].store(nullptr, std::memory_order_relaxed); } } } ~HazardPointers() { for (int it = 0; it < HP_MAX_THREADS; it++) { delete[] hp[it]; // Clear the current retired nodes for (unsigned iret = 0; iret < retiredList[it*CLPAD].size(); iret++) { delete retiredList[it*CLPAD][iret]; } } } /** * Progress Condition: wait-free bounded (by maxHPs) */ inline void clear(const int tid) { for (int ihp = 0; ihp < maxHPs; ihp++) { hp[tid][ihp].store(nullptr, std::memory_order_release); } } /** * Progress Condition: wait-free population oblivious */ inline void clearOne(int ihp, const int tid) { hp[tid][ihp].store(nullptr, std::memory_order_release); } /** * Progress Condition: lock-free */ inline T* protect(int index, const std::atomic& atom, const int tid) { T* n = nullptr; T* ret; while ((ret = atom.load()) != n) { hp[tid][index].store(ret); n = ret; } return ret; } /** * This returns the same value that is passed as ptr, which is sometimes useful * Progress Condition: wait-free population oblivious */ inline T* protectPtr(int index, T* ptr, const int tid) { hp[tid][index].store(ptr); /* // For x86-only implementations, use this instead (it's 2x faster than mfence on x86): hp[tid][index].store(ptr, std::memory_order_release); __asm__ __volatile__ ("lock;addl $0,(%%rsp);" ::: "cc","memory") ; */ return ptr; } /** * This returns the same value that is passed as ptr, which is sometimes useful * Progress Condition: wait-free population oblivious */ inline T* protectPtrRelease(int index, T* ptr, const int tid) { hp[tid][index].store(ptr, std::memory_order_release); return ptr; } /** * Progress Condition: wait-free bounded (by the number of threads squared) */ void retire(T* ptr, const int tid) { retiredList[tid*CLPAD].push_back(ptr); if (retiredList[tid*CLPAD].size() < HP_THRESHOLD_R) return; for (unsigned iret = 0; iret < retiredList[tid*CLPAD].size();) { auto obj = retiredList[tid*CLPAD][iret]; bool canDelete = true; for (int tid = 0; tid < maxThreads && canDelete; tid++) { for (int ihp = 0; ihp < maxHPs; ihp++) { if (hp[tid][ihp].load() == obj) { canDelete = false; break; } } } if (canDelete) { retiredList[tid*CLPAD].erase(retiredList[tid*CLPAD].begin() + iret); delete obj; continue; } iret++; } } }; #endif /* _HAZARD_POINTERS_H_ */ ================================================ FILE: datastructures/linkedlists/MagedHarrisLinkedListSetHE.hpp 
================================================ /****************************************************************************** * Copyright (c) 2014-2016, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _MAGED_M_MICHAEL_LINKED_LIST_HE_H_ #define _MAGED_M_MICHAEL_LINKED_LIST_HE_H_ #include #include #include #include #include #include #include "common/HazardEras.hpp" /** * This is the linked list by Maged M. Michael that uses Hazard Eras. * Lock-Free Linked List as described in Maged M. Michael paper (Figure 7): * http://www.cs.tau.ac.il/~afek/p73-Lock-Free-HashTbls-michael.pdf * *

* This set has three operations:
*  - add(x)      - Lock-Free
*  - remove(x)   - Lock-Free
*  - contains(x) - Lock-Free
*
* (A sketch of the mark-bit removal that remove(x) relies on follows below.)
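*
* A minimal sketch of that two-step removal (illustrative only; 'prev' and 'curr'
* stand in for the traversal state computed by find()):
*
*   Node* next = curr->next.load();
*   curr->next.compare_exchange_strong(next, getMarked(next)); // 1) logically delete curr
*   prev->compare_exchange_strong(curr, getUnmarked(next));    // 2) physically unlink curr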

* @author Pedro Ramalhete
* @author Andreia Correia
*/
template<typename T>
class MagedHarrisLinkedListSetHE {

private:
    struct Node {
        T key;
        uint64_t newEra;
        uint64_t delEra;
        std::atomic<Node*> next;
        Node(T key, uint64_t newEra) : key{key}, newEra{newEra}, delEra{0}, next{nullptr} { }
        bool casNext(Node *cmp, Node *val) {
            return next.compare_exchange_strong(cmp, val);
        }
    };

    // Pointers to head and tail sentinel nodes of the list
    std::atomic<Node*> head;
    std::atomic<Node*> tail;

    const int maxThreads;
    // We need 3 hazard eras
    HazardEras<Node> he {3, maxThreads};
    const int kHp0 = 0; // Protects next
    const int kHp1 = 1; // Protects curr
    const int kHp2 = 2; // Protects prev

public:
    MagedHarrisLinkedListSetHE(const int maxThreads) : maxThreads{maxThreads} {
        head.store(new Node({}, 1));  // Uses T's default constructor
        tail.store(new Node({}, 1));  // Uses T's default constructor
        head.load()->next.store(tail.load());
    }

    // We don't expect the destructor to be called if this instance can still be in use
    ~MagedHarrisLinkedListSetHE() {
        Node *prev = head.load();
        Node *node = prev->next.load();
        while (node != nullptr) {
            delete prev;
            prev = node;
            node = prev->next.load();
        }
        delete prev;
    }

    static std::string className() { return "MagedHarris-LinkedListSetHE"; }

    /*
     * This function is single-threaded, meant to be called at the start of the test.
     * It is assumed that the keys are ordered.
     */
    void addAll(T** keys, const int size, const int tid) {
        Node* node = head.load();
        for (int i = 0; i < size; i++) {
            Node* newNode = new Node(*keys[i], he.getEra());
            node->next.store(newNode, std::memory_order_relaxed);
            node = newNode;
        }
        node->next.store(tail.load(std::memory_order_relaxed), std::memory_order_relaxed);
    }

    /**
     * This method is named 'Insert()' in the original paper.
     * Taken from Figure 7 of the paper:
     * "High Performance Dynamic Lock-Free Hash Tables and List-Based Sets"
     *

* Progress Condition: Lock-Free * */ bool add(T key, const int tid) { Node *curr, *next; std::atomic *prev; Node* newNode = new Node(key, he.getEra()); while (true) { if (find(&key, &prev, &curr, &next, tid)) { delete newNode; // There is already a matching key he.clear(tid); return false; } newNode->next.store(curr, std::memory_order_relaxed); Node *tmp = getUnmarked(curr); if (prev->compare_exchange_strong(tmp, newNode)) { // seq-cst he.clear(tid); return true; } } } /** * This method is named 'Delete()' in the original paper. * Taken from Figure 7 of the paper: * "High Performance Dynamic Lock-Free Hash Tables and List-Based Sets" */ bool remove(T key, const int tid) { Node *curr, *next; std::atomic *prev; while (true) { /* Try to find the key in the list. */ if (!find(&key, &prev, &curr, &next, tid)) { he.clear(tid); return false; } /* Mark if needed. */ Node *tmp = getUnmarked(next); if (!curr->next.compare_exchange_strong(tmp, getMarked(next))) { continue; /* Another thread interfered. */ } tmp = getUnmarked(curr); if (prev->compare_exchange_strong(tmp, getUnmarked(next))) { /* Unlink */ he.clear(tid); he.retire(getUnmarked(curr), tid); /* Reclaim */ } else { he.clear(tid); } /* * If we want to prevent the possibility of there being an * unbounded number of unmarked nodes, add "else _find(head,key)." * This is not necessary for correctness. */ return true; } } /** * This is named 'Search()' on the original paper * Taken from Figure 7 of the paper: * "High Performance Dynamic Lock-Free Hash Tables and List-Based Sets" *

* Progress Condition: Lock-Free */ bool contains(T key, const int tid) { Node *curr, *next; std::atomic *prev; bool isContains = find(&key, &prev, &curr, &next, tid); he.clear(tid); return isContains; } private: /** *

* Progress Condition: Lock-Free */ bool find (T* key, std::atomic **par_prev, Node **par_curr, Node **par_next, const int tid) { std::atomic *prev; Node *curr, *next; try_again: prev = &head; // Protect curr with a hazard era curr = he.get_protected(kHp1, *prev, tid); while (true) { if (getUnmarked(curr) == nullptr) break; // TODO: Will it ever happen? // Protect next with a hazard era. next = he.get_protected(kHp0, curr->next, tid); if (getUnmarked(curr)->next.load() != next) goto try_again; if (getUnmarked(next) == tail.load()) break; if (prev->load() != getUnmarked(curr)) goto try_again; if (getUnmarked(next) == next) { // !cmark in the paper if (!(getUnmarked(curr)->key < *key)) { // Check for null to handle head and tail *par_curr = curr; *par_prev = prev; *par_next = next; return (getUnmarked(curr)->key == *key); } prev = &getUnmarked(curr)->next; he.protectEraRelease(kHp2, kHp1, tid); } else { // Update the link and retire the node. Node *tmp = getUnmarked(curr); if (!prev->compare_exchange_strong(tmp, getUnmarked(next))) { goto try_again; } he.retire(getUnmarked(curr), tid); } curr = next; he.protectEraRelease(kHp1, kHp0, tid); } *par_curr = curr; *par_prev = prev; *par_next = next; return false; } bool isMarked(Node * node) { return ((size_t) node & 0x1); } Node * getMarked(Node * node) { return (Node*)((size_t) node | 0x1); } Node * getUnmarked(Node * node) { return (Node*)((size_t) node & (~0x1)); } }; #endif /* _MAGED_M_MICHAEL_LINKED_LIST_HE_H_ */ ================================================ FILE: datastructures/linkedlists/MagedHarrisLinkedListSetHP.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2016, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _MAGED_MICHAEL_TIM_HARRIS_LINKED_LIST_HP_H_ #define _MAGED_MICHAEL_TIM_HARRIS_LINKED_LIST_HP_H_ #include #include #include #include #include #include #include "common/HazardPointers.hpp" /** * This is the linked list by Maged M. 
Michael, adapted to use Hazard Pointers correctly (Harris's original algorithm is
* not compatible with HPs).
* Lock-Free Linked List as described in Maged M. Michael's paper (Figure 4):
* http://www.cs.tau.ac.il/~afek/p73-Lock-Free-HashTbls-michael.pdf
*

* This set has three operations:
*  - add(x)      - Lock-Free
*  - remove(x)   - Lock-Free
*  - contains(x) - Lock-Free
*
* (A sketch of the hazard-pointer validation used during traversal follows below.)
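*
* A minimal sketch of that validation pattern (illustrative only; 'prev' is a
* std::atomic<Node*>* standing in for the traversal state inside find()):
*
*   Node* curr = prev->load();
*   hp.protectPtr(kHp1, curr, tid);            // publish curr as a hazard pointer
*   if (prev->load() != curr) goto try_again;  // re-validate: curr may already be unlinked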

* @author Pedro Ramalhete
* @author Andreia Correia
*/
template<typename T>
class MagedHarrisLinkedListSetHP {

private:
    struct Node {
        T key;
        std::atomic<Node*> next;
        Node(T key) : key{key}, next{nullptr} { }
        bool casNext(Node *cmp, Node *val) {
            return next.compare_exchange_strong(cmp, val);
        }
    };

    // Pointers to head and tail sentinel nodes of the list
    std::atomic<Node*> head;
    std::atomic<Node*> tail;

    const int maxThreads;
    // We need 3 hazard pointers
    HazardPointers<Node> hp {3, maxThreads};
    const int kHp0 = 0; // Protects next
    const int kHp1 = 1; // Protects curr
    const int kHp2 = 2; // Protects prev

public:
    MagedHarrisLinkedListSetHP(const int maxThreads) : maxThreads{maxThreads} {
        head.store(new Node({}));
        tail.store(new Node({}));
        head.load()->next.store(tail.load());
    }

    // We don't expect the destructor to be called if this instance can still be in use
    ~MagedHarrisLinkedListSetHP() {
        Node *prev = head.load();
        Node *node = prev->next.load();
        while (node != nullptr) {
            delete prev;
            prev = node;
            node = prev->next.load();
        }
        delete prev;
    }

    static std::string className() { return "MagedHarris-LinkedListSetHP"; }

    /*
     * This function is single-threaded, meant to be called at the start of the test.
     * It is assumed that the keys are ordered.
     */
    void addAll(T** keys, const int size, const int tid) {
        Node* node = head.load();
        for (int i = 0; i < size; i++) {
            Node* newNode = new Node(*keys[i]);
            node->next.store(newNode, std::memory_order_relaxed);
            node = newNode;
        }
        node->next.store(tail.load(std::memory_order_relaxed), std::memory_order_relaxed);
    }

    /**
     * This method is named 'Insert()' in the original paper.
     * Taken from Figure 7 of the paper:
     * "High Performance Dynamic Lock-Free Hash Tables and List-Based Sets"
     *

* Progress Condition: Lock-Free * */ bool add(T key, const int tid) { Node *curr, *next; std::atomic *prev; Node* newNode = new Node(key); while (true) { if (find(&key, &prev, &curr, &next, tid)) { delete newNode; // There is already a matching key hp.clear(tid); return false; } newNode->next.store(curr, std::memory_order_relaxed); Node *tmp = getUnmarked(curr); if (prev->compare_exchange_strong(tmp, newNode)) { // seq-cst hp.clear(tid); return true; } } } /** * This method is named 'Delete()' in the original paper. * Taken from Figure 7 of the paper: * "High Performance Dynamic Lock-Free Hash Tables and List-Based Sets" */ bool remove(T key, const int tid) { Node *curr, *next; std::atomic *prev; while (true) { /* Try to find the key in the list. */ if (!find(&key, &prev, &curr, &next, tid)) { hp.clear(tid); return false; } /* Mark if needed. */ Node *tmp = getUnmarked(next); if (!curr->next.compare_exchange_strong(tmp, getMarked(next))) { continue; /* Another thread interfered. */ } tmp = getUnmarked(curr); if (prev->compare_exchange_strong(tmp, getUnmarked(next))) { /* Unlink */ hp.clear(tid); hp.retire(getUnmarked(curr), tid); /* Reclaim */ } else { hp.clear(tid); } /* * If we want to prevent the possibility of there being an * unbounded number of unmarked nodes, add "else _find(head,key)." * This is not necessary for correctness. */ return true; } } /** * This is named 'Search()' on the original paper * Taken from Figure 7 of the paper: * "High Performance Dynamic Lock-Free Hash Tables and List-Based Sets" *

* Progress Condition: Lock-Free */ bool contains(T key, const int tid) { Node *curr, *next; std::atomic *prev; bool isContains = find(&key, &prev, &curr, &next, tid); hp.clear(tid); return isContains; } private: /** * TODO: This needs to be code reviewed... it's not production-ready *

* Progress Condition: Lock-Free */ bool find (T* key, std::atomic **par_prev, Node **par_curr, Node **par_next, const int tid) { std::atomic *prev; Node *curr, *next; try_again: prev = &head; curr = prev->load(); // Protect curr with a hazard pointer. hp.protectPtr(kHp1, curr, tid); if (prev->load() != getUnmarked(curr)) goto try_again; while (true) { if (getUnmarked(curr) == nullptr) break; // Protect next with a hazard pointer. next = curr->next.load(); hp.protectPtr(kHp0, getUnmarked(next), tid); if (getUnmarked(curr)->next.load() != next) goto try_again; if (getUnmarked(next) == tail.load()) break; if (prev->load() != getUnmarked(curr)) goto try_again; if (getUnmarked(next) == next) { // !cmark in the paper if (!(getUnmarked(curr)->key < *key)) { // Check for null to handle head and tail *par_curr = curr; *par_prev = prev; *par_next = next; return (getUnmarked(curr)->key == *key); } prev = &getUnmarked(curr)->next; hp.protectRelease(kHp2, getUnmarked(curr), tid); } else { // Update the link and retire the node. Node *tmp = getUnmarked(curr); if (!prev->compare_exchange_strong(tmp, getUnmarked(next))) { goto try_again; } hp.retire(getUnmarked(curr), tid); } curr = next; hp.protectRelease(kHp1, getUnmarked(next), tid); } *par_curr = curr; *par_prev = prev; *par_next = next; return false; } bool isMarked(Node * node) { return ((size_t) node & 0x1); } Node * getMarked(Node * node) { return (Node*)((size_t) node | 0x1); } Node * getUnmarked(Node * node) { return (Node*)((size_t) node & (~0x1)); } }; #endif /* _MAGED_MICHAEL_TIM_HARRIS_LINKED_LIST_HP_H_ */ ================================================ FILE: datastructures/linkedlists/OFLFLinkedListSet.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _ONEFILE_LF_LINKED_LIST_SET_H_ #define _ONEFILE_LF_LINKED_LIST_SET_H_ #include #include #include "stms/OneFileLF.hpp" /** *

* A Linked List Set for One-File STM (Lock-Free)
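*
* Note: the transactions below load 'tail' once into a local ('ltail') and compare
* against that plain pointer, instead of re-reading the transactional 'tail' on
* every loop iteration; 'node->key' is likewise read once per node into 'nkey'.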

*/ template class OFLFLinkedListSet : public oflf::tmbase { private: struct Node : public oflf::tmbase { T key {}; oflf::tmtype next {nullptr}; Node() {} Node(T key) : key{key} { } }; alignas(128) oflf::tmtype head {nullptr}; alignas(128) oflf::tmtype tail {nullptr}; public: OFLFLinkedListSet(unsigned int maxThreads=0) { oflf::updateTx([this] () { Node* lhead = oflf::tmNew(); Node* ltail = oflf::tmNew(); head = lhead; head->next = ltail; tail = ltail; }); } ~OFLFLinkedListSet() { oflf::updateTx([this] () { // Delete all the nodes in the list Node* prev = head; Node* node = prev->next; while (node != tail) { oflf::tmDelete(prev); prev = node; node = node->next; } oflf::tmDelete(prev); oflf::tmDelete(tail.pload()); }); } static std::string className() { return oflf::OneFileLF::className() + "-LinkedListSet"; } /* * Progress Condition: lock-free * Adds a node with a key, returns false if the key is already in the set */ bool add(T key, const int tid=0) { return oflf::updateTx([this,key] () -> bool { Node* newNode = oflf::tmNew(key); Node* prev = head; Node* node = prev->next; Node* ltail = tail; while (true) { if (node == ltail) break; T nkey = node->key; if (key == nkey) { oflf::tmDelete(newNode); // If the key was already in the set, free the node that was never used return false; } if (nkey < key) break; prev = node; node = node->next; } prev->next = newNode; newNode->next = node; return true; }); } /* * Progress Condition: lock-free * Removes a node with an key, returns false if the key is not in the set */ bool remove(T key, const int tid=0) { return oflf::updateTx([this,key] () -> bool { Node* prev = head; Node* node = prev->next; Node* ltail = tail; while (true) { if (node == ltail) return false; T nkey = node->key; if (key == nkey) { prev->next = node->next; oflf::tmDelete(node); return true; } if (nkey < key) return false; prev = node; node = node->next; } }); } /* * Progress Condition: lock-free * Returns true if it finds a node with a matching key */ bool contains(T key, const int tid=0) { return oflf::readTx([this,key] () -> bool { Node* node = head->next; Node* ltail = tail; while (true) { if (node == ltail) return false; T nkey = node->key; if (key == nkey) return true; if (nkey < key) return false; node = node->next; } }); } bool addAll(T** keys, int size, const int tid) { for (int i = 0; i < size; i++) add(*keys[i], tid); return true; } }; #endif /* _ONE_FILE_LF_LINKED_LIST_SET_H_ */ ================================================ FILE: datastructures/linkedlists/OFWFLinkedListSet.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _ONEFILE_WF_LINKED_LIST_SET_H_ #define _ONEFILE_WF_LINKED_LIST_SET_H_ #include #include #include "stms/OneFileWF.hpp" /** *

* A Linked List Set for One-File STM (Wait-Free)

*/ template class OFWFLinkedListSet : public ofwf::tmbase { private: struct Node : public ofwf::tmbase { T key {}; ofwf::tmtype next {nullptr}; Node(T key) : key{key} { } Node() {} }; alignas(128) ofwf::tmtype head {nullptr}; alignas(128) ofwf::tmtype tail {nullptr}; public: OFWFLinkedListSet(unsigned int maxThreads=0) { ofwf::updateTx([this] () { Node* lhead = ofwf::tmNew(); Node* ltail = ofwf::tmNew(); head = lhead; head->next = ltail; tail = ltail; }); } ~OFWFLinkedListSet() { ofwf::updateTx([this] () { // Delete all the nodes in the list Node* prev = head; Node* node = prev->next; while (node != tail) { ofwf::tmDelete(prev); prev = node; node = node->next; } ofwf::tmDelete(prev); ofwf::tmDelete(tail.pload()); }); } static std::string className() { return ofwf::OneFileWF::className() + "-LinkedListSet"; } /* * Progress Condition: wait-free * Adds a node with a key, returns false if the key is already in the set */ bool add(T key, const int tid=0) { return ofwf::updateTx([this,key] () { Node* newNode = ofwf::tmNew(key); Node* prev = head; Node* node = prev->next; Node* ltail = tail; while (true) { if (node == ltail) break; T nkey = node->key; if (key == nkey) { ofwf::tmDelete(newNode); // If the key was already in the set, free the node that was never used return false; } if (nkey < key) break; prev = node; node = node->next; } prev->next = newNode; newNode->next = node; return true; }); } /* * Progress Condition: wait-free * Removes a node with an key, returns false if the key is not in the set */ bool remove(T key, const int tid=0) { return ofwf::updateTx([this,key] () { Node* prev = head; Node* node = prev->next; Node* ltail = tail; while (true) { if (node == ltail) return false; T nkey = node->key; if (key == nkey) { prev->next = node->next; ofwf::tmDelete(node); return true; } if (nkey < key) return false; prev = node; node = node->next; } }); } /* * Progress Condition: wait-free * Returns true if it finds a node with a matching key */ bool contains(T key, const int tid=0) { return ofwf::readTx([this,key] () { Node* node = head->next; Node* ltail = tail; while (true) { if (node == ltail) return false; T nkey = node->key; if (key == nkey) return true; if (nkey < key) return false; node = node->next; } }); } bool addAll(T** keys, int size, const int tid) { for (int i = 0; i < size; i++) add(*keys[i], tid); return true; } }; #endif /* _ONE_FILE_WF_LINKED_LIST_SET_H_ */ ================================================ FILE: datastructures/linkedlists/STMLinkedListSet.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2018, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _STM_LINKED_LIST_SET_H_ #define _STM_LINKED_LIST_SET_H_ #include #include //#include "stms/OneFileLF.hpp" /** *

* A Linked List Set for an STM

* * TODO * * * @author Pedro Ramalhete * @author Andreia Correia */ template class TMTYPE, template class TMBASE> class STMLinkedListSet : public TMBASE { private: struct Node : public TMBASE { T* key; TMTYPE next; Node(T* key) : key{key}, next{nullptr} { } }; alignas(128) TMTYPE head {nullptr}; alignas(128) TMTYPE tail {nullptr}; public: STMLinkedListSet(unsigned int maxThreads=0) { Node* lhead = new Node(nullptr); Node* ltail = new Node(nullptr); head = lhead; head->next = ltail; tail = ltail; } ~STMLinkedListSet() { // Delete all the nodes in the list Node* prev = head; Node* node = prev->next; while (node != tail) { delete prev; prev = node; node = node->next; } delete prev; delete tail; } static std::string className() { return TM::className() + "-LinkedListSet"; } /* * Progress Condition: lock-free * Adds a node with a key, returns false if the key is already in the set */ bool add(T* key, const int tid=0) { if (key == nullptr) throw std::invalid_argument("key can not be nullptr"); return TM::updateTx([this,key] () -> bool { Node* newNode = TM::tmNew(key); Node* prev = head; Node* node = prev->next; while (true) { if (node == tail) break; if (*key == *node->key) { TM::tmDelete(newNode); // If the key was already in the set, free the node that was never used return false; } if (*(node->key) < *key) break; prev = node; node = node->next; } prev->next = newNode; newNode->next = node; return true; }); } /* * Progress Condition: lock-free * Removes a node with an key, returns false if the key is not in the set */ bool remove(T* key, const int tid=0) { if (key == nullptr) throw std::invalid_argument("key can not be nullptr"); return TM::updateTx([this,key] () -> bool { Node* prev = head; Node* node = prev->next; while (true) { if (node == tail) return false; if (*key == *node->key) { prev->next = node->next; TM::tmDelete(node); return true; } if (*(node->key) < *key) return false; prev = node; node = node->next; } }); } /* * Progress Condition: lock-free * Returns true if it finds a node with a matching key */ bool contains(T* key, const int tid=0) { if (key == nullptr) throw std::invalid_argument("key can not be nullptr"); return TM::readTx([this,key] () -> bool { Node* node = head->next; while (true) { if (node == tail) return false; if (*key == *node->key) return true; if (*(node->key) < *key) return false; node = node->next; } }); } bool addAll(T** keys, int size, const int tid) { for (int i = 0; i < size; i++) add(keys[i], tid); } }; #endif /* _ONE_FILE_LF_LINKED_LIST_SET_H_ */ ================================================ FILE: datastructures/linkedlists/TinySTMLinkedListSet.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _TINY_STM_LINKED_LIST_SET_H_ #define _TINY_STM_LINKED_LIST_SET_H_ #include "stms/TinySTM.hpp" /** *

* A Linked List Set for usage with TinySTM

*/ template class TinySTMLinkedListSet : public tinystm::tmbase { private: struct Node : public tinystm::tmbase { T key; tinystm::tmtype next{nullptr}; Node() {} Node(T key) : key{key} { } }; alignas(128) tinystm::tmtype head {nullptr}; alignas(128) tinystm::tmtype tail {nullptr}; public: TinySTMLinkedListSet(unsigned int maxThreads=0) { tinystm::updateTx([this] () { Node* lhead = tinystm::tmNew(); Node* ltail = tinystm::tmNew(); head = lhead; head->next = ltail; tail = ltail; return true; }); } ~TinySTMLinkedListSet() { tinystm::updateTx([this] () { // Delete all the nodes in the list Node* prev = head; Node* node = prev->next; while (node != tail) { tinystm::tmDelete(prev); prev = node; node = node->next; } tinystm::tmDelete(prev); tinystm::tmDelete(tail.load()); return true; }); } static std::string className() { return "TinySTM-LinkedListSet"; } /* * Progress Condition: blocking * Adds a node with a key, returns false if the key is already in the set */ bool add(T key, const int tid=0) { return tinystm::updateTx([this,key] () { Node* newNode = tinystm::tmNew(key); Node* prev = head; Node* node = prev->next; Node* ltail = tail; while (true) { if (node == ltail) break; T nkey = node->key; if (key == nkey) { tinystm::tmDelete(newNode); // If the key was already in the set, free the node that was never used return false; } if (nkey < key) break; prev = node; node = node->next; } prev->next = newNode; newNode->next = node; return true; }); } /* * Progress Condition: blocking * Removes a node with an key, returns false if the key is not in the set */ bool remove(T key, const int tid=0) { return tinystm::updateTx([this,key] () { Node* prev = head; Node* node = prev->next; Node* ltail = tail; while (true) { if (node == ltail) return false; T nkey = node->key; if (key == nkey) { prev->next = node->next; tinystm::tmDelete(node); return true; } if (nkey < key) return false; prev = node; node = node->next; } }); } /* * Progress Condition: blocking * Returns true if it finds a node with a matching key */ bool contains(T key, const int tid=0) { return tinystm::readTx([this,key] () { Node* node = head->next; Node* ltail = tail; while (true) { if (node == ltail) return false; T nkey = node->key; if (key == nkey) return true; if (nkey < key) return false; node = node->next; } }); } bool addAll(T** keys, int size, const int tid) { for (int i = 0; i < size; i++) add(*keys[i], tid); } }; #endif /* _TINY_STM_LINKED_LIST_SET_H_ */ ================================================ FILE: datastructures/queues/CRWWPLinkedListQueue.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2017, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _CRWWP_LINKED_LIST_QUEUE_H_ #define _CRWWP_LINKED_LIST_QUEUE_H_ #include #include #include "../../stms/CRWWPSTM.hpp" /** *

* A Linked List queue using C-RW-WP STM
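*
* A minimal usage sketch (illustrative only; 'Item' and 'tid' are assumptions):
*
*   CRWWPLinkedListQueue<Item> q {};
*   q.enqueue(new Item{}, tid);   // throws std::invalid_argument for nullptr
*   Item* it = q.dequeue(tid);    // returns nullptr when the queue is empty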

* * * TODO * * * enqueue algorithm: sequential implementation + MWC * dequeue algorithm: sequential implementation + MWC * Consistency: Linearizable * enqueue() progress: lock-free * dequeue() progress: lock-free * Memory Reclamation: Hazard Eras (integrated into MWC) * enqueue min ops: 2 DCAS + 1 CAS * dequeue min ops: 1 DCAS + 1 CAS * * @author Pedro Ramalhete * @author Andreia Correia */ template class CRWWPLinkedListQueue { private: struct Node : crwwpstm::tmbase { T* item; crwwpstm::tmtype next; Node(T* userItem) : item{userItem}, next{nullptr} { } }; alignas(128) crwwpstm::tmtype head {nullptr}; alignas(128) crwwpstm::tmtype tail {nullptr}; public: CRWWPLinkedListQueue(unsigned int maxThreads=0) { Node* sentinelNode = new Node(nullptr); head = sentinelNode; tail = sentinelNode; } ~CRWWPLinkedListQueue() { while (dequeue() != nullptr); // Drain the queue Node* lhead = head; delete lhead; } static std::string className() { return "CRWWP-LinkedListQueue"; } /* * Progress Condition: lock-free * Always returns true */ bool enqueue(T* item, const int tid=0) { if (item == nullptr) throw std::invalid_argument("item can not be nullptr"); Node* newNode = crwwpstm::tmNew(item); // Let's allocate outside the transaction, less overhead return crwwpstm::updateTx([this,&newNode] () -> bool { tail->next = newNode; tail = newNode; return true; }); } /* * Progress Condition: lock-free */ T* dequeue(const int tid=0) { return crwwpstm::updateTx([this] () -> T* { Node* lhead = head; if (lhead == tail) return nullptr; head = lhead->next; crwwpstm::tmDelete(lhead); return head->item; }); } }; #endif /* _CRWWP_TM_LINKED_LIST_QUEUE_H_ */ ================================================ FILE: datastructures/queues/ESTMArrayLinkedListQueue.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2017, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
****************************************************************************** */ #ifndef _ELASTIC_STM_ARRAY_LINKED_LIST_QUEUE_H_ #define _ELASTIC_STM_ARRAY_LINKED_LIST_QUEUE_H_ #include #include #include "stms/ESTM.hpp" /** *

* An Array Linked List Queue using Elastic STM (blocking)
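*
* The amortization idea, in isolation (illustrative sketch, not a drop-in excerpt;
* the tmNew<Node> template-argument form is an assumption): each node carries
* ITEM_NUM item slots, so a new node is allocated only once every ITEM_NUM enqueues:
*
*   if (ltail->tailidx < Node::ITEM_NUM) {   // common case: store into the array
*       ltail->items[ltail->tailidx] = item;
*       ++ltail->tailidx;
*   } else {                                 // rare case: append a fresh node
*       Node* newNode = estm::tmNew<Node>(item);
*       tail->next = newNode;
*       tail = newNode;
*   }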

* * TODO * * * enqueue algorithm: sequential implementation + MWC * dequeue algorithm: sequential implementation + MWC * Consistency: Linearizable * enqueue() progress: lock-free * dequeue() progress: lock-free * Memory Reclamation: Hazard Eras (integrated into MWC) * enqueue min ops: 2 DCAS + 1 CAS * dequeue min ops: 1 DCAS + 1 CAS * * @author Pedro Ramalhete * @author Andreia Correia */ template class ESTMArrayLinkedListQueue { private: struct Node : estm::tmbase { static const int ITEM_NUM = 1024; estm::tmtype headidx {0}; estm::tmtype items[ITEM_NUM]; estm::tmtype tailidx {0}; estm::tmtype next {nullptr}; Node(T* item) { items[0] = item; tailidx = 1; headidx = 0; for (int i = 1; i < ITEM_NUM; i++) items[i] = nullptr; } }; alignas(128) estm::tmtype head {nullptr}; alignas(128) estm::tmtype tail {nullptr}; public: ESTMArrayLinkedListQueue(unsigned int maxThreads=0) { estm::updateTx([this] () { Node* sentinelNode = estm::tmNew(nullptr); sentinelNode->tailidx = 0; head = sentinelNode; tail = sentinelNode; return true; }); } ~ESTMArrayLinkedListQueue() { while (dequeue() != nullptr); // Drain the queue estm::updateTx([this] () { Node* lhead = head; estm::tmDelete(lhead); return true; }); } static std::string className() { return "ESTM-ArrayLinkedListQueue"; } /* * Progress Condition: blocking * Always returns true */ bool enqueue(T* item, const int tid=0) { if (item == nullptr) throw std::invalid_argument("item can not be nullptr"); return estm::updateTx([this,item] () -> bool { Node* ltail = tail; uint64_t ltailidx = ltail->tailidx; if (ltailidx < Node::ITEM_NUM) { ltail->items[ltailidx] = item; ++ltail->tailidx; return true; } Node* newNode = estm::tmNew(item); tail->next = newNode; tail = newNode; return true; }); } /* * Progress Condition: blocking */ T* dequeue(const int tid=0) { return estm::updateTx([this] () -> T* { Node* lhead = head; uint64_t lheadidx = lhead->headidx; // Check if queue is empty if (lhead == tail && lheadidx == tail->tailidx) return nullptr; if (lheadidx < Node::ITEM_NUM) { ++lhead->headidx; return lhead->items[lheadidx]; } lhead = lhead->next; estm::tmDelete(head.load()); head = lhead; ++lhead->headidx; return lhead->items[0]; }); } }; #endif /* _OF_WF_ARRAY_LINKED_LIST_QUEUE_H_ */ ================================================ FILE: datastructures/queues/ESTMLinkedListQueue.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2018, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _ELASTIC_STM_LINKED_LIST_QUEUE_H_ #define _ELASTIC_STM_LINKED_LIST_QUEUE_H_ #include #include #include "stms/ESTM.hpp" /** *

* A Linked List queue using Elastic STM (blocking)

* * * TODO * * @author Pedro Ramalhete * @author Andreia Correia */ template class ESTMLinkedListQueue { private: struct Node : estm::tmbase { T* item; estm::tmtype next {nullptr}; Node(T* userItem) : item{userItem} { } }; alignas(128) estm::tmtype head {nullptr}; alignas(128) estm::tmtype tail {nullptr}; public: ESTMLinkedListQueue(unsigned int maxThreads=0) { estm::updateTx([this] () { Node* sentinelNode = estm::tmNew(nullptr); head = sentinelNode; tail = sentinelNode; return true; }); } ~ESTMLinkedListQueue() { while (dequeue() != nullptr); // Drain the queue estm::updateTx([this] () { Node* lhead = head; estm::tmDelete(lhead); return true; }); } static std::string className() { return "ESTM-LinkedListQueue"; } /* * Progress Condition: blocking * Always returns true */ bool enqueue(T* item, const int tid=0) { if (item == nullptr) throw std::invalid_argument("item can not be nullptr"); return estm::updateTx([this,item] () -> bool { Node* newNode = estm::tmNew(item); tail->next = newNode; tail = newNode; return true; }); } /* * Progress Condition: blocking */ T* dequeue(const int tid=0) { return estm::updateTx([this] () -> T* { Node* lhead = head; if (lhead == tail) return nullptr; head = lhead->next; estm::tmDelete(lhead); return head->item; }); } }; #endif /* _ESTM_LINKED_LIST_QUEUE_H_ */ ================================================ FILE: datastructures/queues/FAAArrayQueue.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2016, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _FAA_ARRAY_QUEUE_HP_H_ #define _FAA_ARRAY_QUEUE_HP_H_ #include #include #include "HazardPointers.hpp" /** *

* Fetch-And-Add Array Queue

* * Each node has one array but we don't search for a vacant entry. Instead, we * use FAA to obtain an index in the array, for enqueueing or dequeuing. * * There are some similarities between this queue and the basic queue in YMC: * http://chaoran.me/assets/pdf/wfq-ppopp16.pdf * but it's not the same because the queue in listing 1 is obstruction-free, while * our algorithm is lock-free. * In FAAArrayQueue eventually a new node will be inserted (using Michael-Scott's * algorithm) and it will have an item pre-filled in the first position, which means * that at most, after BUFFER_SIZE steps, one item will be enqueued (and it can then * be dequeued). This kind of progress is lock-free. * * Each entry in the array may contain one of three possible values: * - A valid item that has been enqueued; * - nullptr, which means no item has yet been enqueued in that position; * - taken, a special value that means there was an item but it has been dequeued; * * Enqueue algorithm: FAA + CAS(null,item) * Dequeue algorithm: FAA + CAS(item,taken) * Consistency: Linearizable * enqueue() progress: lock-free * dequeue() progress: lock-free * Memory Reclamation: Hazard Pointers (lock-free) * Uncontended enqueue: 1 FAA + 1 CAS + 1 HP * Uncontended dequeue: 1 FAA + 1 CAS + 1 HP * * *
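*
* The core slot-claiming step, in isolation (illustrative sketch of the idea, not a
* drop-in excerpt; 'node' stands for the current tail node):
*
*   const int idx = node->enqidx.fetch_add(1);  // FAA: claim a unique slot index
*   T* expected = nullptr;
*   if (node->items[idx].compare_exchange_strong(expected, item)) return; // enqueued
*   // CAS failed: a dequeuer marked the slot 'taken' first; retry with a later slot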

* New nodes are linked using Maged Michael and Michael Scott's lock-free queue
* algorithm, described in their paper "Simple, Fast, and Practical Non-Blocking
* and Blocking Concurrent Queue Algorithms":
* {@link http://www.cs.rochester.edu/~scott/papers/1996_PODC_queues.pdf}
*

* The paper on Hazard Pointers is named "Hazard Pointers: Safe Memory * Reclamation for Lock-Free objects" and it is available here: * http://web.cecs.pdx.edu/~walpole/class/cs510/papers/11.pdf * * @author Pedro Ramalhete * @author Andreia Correia */ template class FAAArrayQueue { static const long BUFFER_SIZE = 1024; // 1024 private: struct Node { alignas(128) std::atomic deqidx; alignas(128) std::atomic items[BUFFER_SIZE]; alignas(128) std::atomic enqidx; alignas(128) std::atomic next; // Start with the first entry pre-filled and enqidx at 1 Node(T* item) : deqidx{0}, enqidx{1}, next{nullptr} { items[0].store(item, std::memory_order_relaxed); for (long i = 1; i < BUFFER_SIZE; i++) { items[i].store(nullptr, std::memory_order_relaxed); } } bool casNext(Node *cmp, Node *val) { return next.compare_exchange_strong(cmp, val); } }; bool casTail(Node *cmp, Node *val) { return tail.compare_exchange_strong(cmp, val); } bool casHead(Node *cmp, Node *val) { return head.compare_exchange_strong(cmp, val); } // Pointers to head and tail of the list alignas(128) std::atomic head; alignas(128) std::atomic tail; static const int MAX_THREADS = 128; const int maxThreads; T* taken = (T*)new int(); // Muuuahahah ! // We need just one hazard pointer HazardPointers hp {1, maxThreads}; const int kHpTail = 0; const int kHpHead = 0; public: FAAArrayQueue(int maxThreads=MAX_THREADS) : maxThreads{maxThreads} { Node* sentinelNode = new Node(nullptr); sentinelNode->enqidx.store(0, std::memory_order_relaxed); head.store(sentinelNode, std::memory_order_relaxed); tail.store(sentinelNode, std::memory_order_relaxed); } ~FAAArrayQueue() { while (dequeue(0) != nullptr); // Drain the queue delete head.load(); // Delete the last node delete (int*)taken; } static std::string className() { return "FAAArrayQueue"; } void enqueue(T* item, const int tid) { if (item == nullptr) throw std::invalid_argument("item can not be nullptr"); while (true) { Node* ltail = hp.protect(kHpTail, tail, tid); const int idx = ltail->enqidx.fetch_add(1); if (idx > BUFFER_SIZE-1) { // This node is full if (ltail != tail.load()) continue; Node* lnext = ltail->next.load(); if (lnext == nullptr) { Node* newNode = new Node(item); if (ltail->casNext(nullptr, newNode)) { casTail(ltail, newNode); hp.clear(tid); return; } delete newNode; } else { casTail(ltail, lnext); } continue; } T* itemnull = nullptr; if (ltail->items[idx].compare_exchange_strong(itemnull, item)) { hp.clear(tid); return; } } } T* dequeue(const int tid) { while (true) { Node* lhead = hp.protect(kHpHead, head, tid); if (lhead->deqidx.load() >= lhead->enqidx.load() && lhead->next.load() == nullptr) break; const int idx = lhead->deqidx.fetch_add(1); if (idx > BUFFER_SIZE-1) { // This node has been drained, check if there is another one Node* lnext = lhead->next.load(); if (lnext == nullptr) break; // No more nodes in the queue if (casHead(lhead, lnext)) hp.retire(lhead, tid); continue; } T* item = lhead->items[idx].load(); if (item != nullptr) { hp.clear(tid); return item; } item = lhead->items[idx].exchange(taken); if (item == nullptr) continue; hp.clear(tid); return item; } hp.clear(tid); return nullptr; } }; #endif /* _FAA_ARRAY_QUEUE_HP_H_ */ ================================================ FILE: datastructures/queues/HazardPointers.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2016, Pedro Ramalhete, Andreia Correia * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _HAZARD_POINTERS_H_ #define _HAZARD_POINTERS_H_ #include #include #include #include template class HazardPointers { private: static const int HP_MAX_THREADS = 128; static const int HP_MAX_HPS = 128; // This is named 'K' in the HP paper static const int CLPAD = 128/sizeof(std::atomic); static const int HP_THRESHOLD_R = 0; // This is named 'R' in the HP paper static const int MAX_RETIRED = HP_MAX_THREADS*HP_MAX_HPS; // Maximum number of retired objects per thread const int maxHPs; const int maxThreads; alignas(128) std::atomic* hp[HP_MAX_THREADS]; // It's not nice that we have a lot of empty vectors, but we need padding to avoid false sharing alignas(128) std::vector retiredList[HP_MAX_THREADS*CLPAD]; std::function defdeleter = [](T* t, int tid){ delete t; }; std::function& deleter; public: HazardPointers(int maxHPs, int maxThreads) : maxHPs{maxHPs}, maxThreads{maxThreads}, deleter{defdeleter} { for (int ithread = 0; ithread < HP_MAX_THREADS; ithread++) { hp[ithread] = new std::atomic[HP_MAX_HPS]; for (int ihp = 0; ihp < HP_MAX_HPS; ihp++) { hp[ithread][ihp].store(nullptr, std::memory_order_relaxed); } } } HazardPointers(int maxHPs, int maxThreads, std::function& deleter) : maxHPs{maxHPs}, maxThreads{maxThreads}, deleter{deleter} { for (int ithread = 0; ithread < HP_MAX_THREADS; ithread++) { hp[ithread] = new std::atomic[HP_MAX_HPS]; for (int ihp = 0; ihp < HP_MAX_HPS; ihp++) { hp[ithread][ihp].store(nullptr, std::memory_order_relaxed); } } } ~HazardPointers() { for (int ithread = 0; ithread < HP_MAX_THREADS; ithread++) { delete[] hp[ithread]; // Clear the current retired nodes for (unsigned iret = 0; iret < retiredList[ithread*CLPAD].size(); iret++) { delete retiredList[ithread*CLPAD][iret]; } } } /** * Progress Condition: wait-free bounded (by maxHPs) */ void clear(const int tid) { for (int ihp = 0; ihp < maxHPs; ihp++) { hp[tid][ihp].store(nullptr, std::memory_order_release); } } /** * Progress Condition: wait-free population oblivious */ void clearOne(int ihp, const int tid) { 
hp[tid][ihp].store(nullptr, std::memory_order_release); } /** * Progress Condition: lock-free */ T* protect(int index, const std::atomic& atom, const int tid) { T* n = nullptr; T* ret; while ((ret = atom.load()) != n) { hp[tid][index].store(ret); n = ret; } return ret; } T* get(int index, const int tid){ return hp[tid][index].load(); } /** * This returns the same value that is passed as ptr, which is sometimes useful * Progress Condition: wait-free population oblivious */ T* protectPtr(int index, T* ptr, const int tid) { hp[tid][index].store(ptr); return ptr; } /** * This returns the same value that is passed as ptr, which is sometimes useful * Progress Condition: wait-free population oblivious */ T* protectPtrRelease(int index, T* ptr, const int tid) { hp[tid][index].store(ptr, std::memory_order_release); return ptr; } /** * Progress Condition: wait-free bounded (by the number of threads squared) */ void retire(T* ptr, const int tid) { retiredList[tid*CLPAD].push_back(ptr); if (retiredList[tid*CLPAD].size() < HP_THRESHOLD_R) return; for (unsigned iret = 0; iret < retiredList[tid*CLPAD].size();) { auto obj = retiredList[tid*CLPAD][iret]; bool canDelete = true; for (int tid = 0; tid < maxThreads && canDelete; tid++) { for (int ihp = maxHPs-1; ihp >= 0; ihp--) { if (hp[tid][ihp].load() == obj) { canDelete = false; break; } } } if (canDelete) { retiredList[tid*CLPAD].erase(retiredList[tid*CLPAD].begin() + iret); deleter(obj,tid); continue; } iret++; } } }; #endif /* _HAZARD_POINTERS_H_ */ ================================================ FILE: datastructures/queues/HazardPointersSimQueue.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2016, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
****************************************************************************** */ #ifndef _HAZARD_POINTERS_SIM_QUEUE_H_ #define _HAZARD_POINTERS_SIM_QUEUE_H_ #include #include #include #include // TODO: use std::vector instead of arrays for the retired objects (keep the padding) template class HazardPointersSimQueue { private: static const int HP_MAX_THREADS = 128; static const int HP_MAX_HPS = 11; // This is named 'K' in the HP paper static const int CLPAD = 128/sizeof(std::atomic); static const int HP_THRESHOLD_R = 0; // This is named 'R' in the HP paper static const int MAX_RETIRED = HP_MAX_THREADS*HP_MAX_HPS; // Maximum number of retired objects per thread const int maxHPs; const int maxThreads; std::atomic hp[HP_MAX_THREADS*CLPAD][HP_MAX_HPS]; // It's not nice that we have a lot of empty vectors, but we need padding to avoid false sharing std::vector retiredList[HP_MAX_THREADS*CLPAD]; std::function findPtr; public: HazardPointersSimQueue(std::function& find, int maxHPs=HP_MAX_HPS, int maxThreads=HP_MAX_THREADS) : maxHPs{maxHPs}, maxThreads{maxThreads} { findPtr = find; for (int ithread = 0; ithread < HP_MAX_THREADS; ithread++) { for (int ihp = 0; ihp < HP_MAX_HPS; ihp++) { hp[ithread*CLPAD][ihp].store(nullptr, std::memory_order_relaxed); } } } ~HazardPointersSimQueue() { for (int ithread = 0; ithread < HP_MAX_THREADS; ithread++) { // Clear the current retired nodes for (unsigned iret = 0; iret < retiredList[ithread*CLPAD].size(); iret++) { delete retiredList[ithread*CLPAD][iret]; } } } /** * Progress Condition: wait-free bounded (by maxHPs) */ void clear(const int tid) { for (int ihp = 0; ihp < maxHPs; ihp++) { hp[tid*CLPAD][ihp].store(nullptr, std::memory_order_release); } } /** * Progress Condition: wait-free population oblivious */ void clearOne(int ihp, const int tid) { hp[tid*CLPAD][ihp].store(nullptr, std::memory_order_release); } /** * Progress Condition: lock-free */ T* protect(int index, const std::atomic& atom, const int tid) { T* n = nullptr; T* ret; while ((ret = atom.load()) != n) { hp[tid*CLPAD][index].store(ret); n = ret; } return ret; } /** * This returns the same value that is passed as ptr, which is sometimes useful * Progress Condition: wait-free population oblivious */ T* protectPtr(int index, T* ptr, const int tid) { hp[tid*CLPAD][index].store(ptr); return ptr; } /** * This returns the same value that is passed as ptr, which is sometimes useful * Progress Condition: wait-free population oblivious */ T* protectRelease(int index, T* ptr, const int tid) { hp[tid*CLPAD][index].store(ptr, std::memory_order_release); return ptr; } /** * This returns the same value that is passed as ptr, which is sometimes useful * Progress Condition: wait-free bounded (by the number of threads squared) */ void retire(T* ptr, const int tid) { retiredList[tid*CLPAD].push_back(ptr); for (unsigned iret = 0; iret < retiredList[tid*CLPAD].size();) { auto obj = retiredList[tid*CLPAD][iret]; if (findPtr(obj)) { iret++; continue; } bool canDelete = true; for (int tid = 0; tid < maxThreads && canDelete; tid++) { for (int ihp = maxHPs-1; ihp >= 0; ihp--) { if (hp[tid*CLPAD][ihp].load() == obj) { canDelete = false; break; } } } if (canDelete) { retiredList[tid*CLPAD].erase(retiredList[tid*CLPAD].begin() + iret); delete obj; continue; } iret++; } } }; #endif /* _HAZARD_POINTERS_H_ */ ================================================ FILE: datastructures/queues/LCRQueue.hpp ================================================ /****************************************************************************** 
* Copyright (c) 2014-2016, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _LCRQ_QUEUE_HP_H_ #define _LCRQ_QUEUE_HP_H_ #include #include "HazardPointers.hpp" // CAS2 macro #define __CAS2(ptr, o1, o2, n1, n2) \ ({ \ char __ret; \ __typeof__(o2) __junk; \ __typeof__(*(ptr)) __old1 = (o1); \ __typeof__(o2) __old2 = (o2); \ __typeof__(*(ptr)) __new1 = (n1); \ __typeof__(o2) __new2 = (n2); \ asm volatile("lock cmpxchg16b %2;setz %1" \ : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \ : "b"(__new1), "c"(__new2), \ "a"(__old1), "d"(__old2)); \ __ret; }) #define CAS2(ptr, o1, o2, n1, n2) __CAS2(ptr, o1, o2, n1, n2) #define BIT_TEST_AND_SET(ptr, b) \ ({ \ char __ret; \ asm volatile("lock btsq $63, %0; setnc %1" : "+m"(*ptr), "=a"(__ret) : : "cc"); \ __ret; \ }) /** *

 * LCRQ Queue

 *
 * This is LCRQ by Adam Morrison and Yehuda Afek
 * http://www.cs.tau.ac.il/~mad/publications/ppopp2013-x86queues.pdf
 *
 * This implementation does NOT obey the C++ memory model rules AND it is x86 specific.
 * No guarantees are given on the correctness or consistency of the results if you use this queue.
 *
 * Bugs fixed:
 * - tt was not initialized in dequeue();
 *
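 *
 * Because the CAS2 macro above is cmpxchg16b inline assembly, this file only
 * builds for x86-64; a compile-time guard such as the following could make
 * that explicit (hypothetical, not present in the original):
 *
 *   #if !(defined(__x86_64__) || defined(_M_X64))
 *   #error "LCRQueue requires x86-64 (cmpxchg16b)"
 *   #endif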

 * enqueue algorithm: MS enqueue + LCRQ with re-usage
 * dequeue algorithm: MS dequeue + LCRQ with re-usage
 * Consistency: Linearizable
 * enqueue() progress: lock-free
 * dequeue() progress: lock-free
 * Memory Reclamation: Hazard Pointers (lock-free)
 *
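 *
 * The core fast path, sketched from the enqueue() below: each enqueuer takes
 * a ticket with fetch_add, the ticket picks a cell in the ring, and the cell
 * is claimed with a double-width CAS on its (val, idx) pair:
 *
 *   uint64_t tailticket = ltail->tail.fetch_add(1);
 *   Cell* cell = &ltail->array[tailticket & (RING_SIZE-1)];
 *   if (CAS2((void**)cell, nullptr, idx, item, tailticket)) return;  // claimed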

* The paper on Hazard Pointers is named "Hazard Pointers: Safe Memory * Reclamation for Lock-Free objects" and it is available here: * http://web.cecs.pdx.edu/~walpole/class/cs510/papers/11.pdf * * @author Pedro Ramalhete * @author Andreia Correia */ template class LCRQueue { private: static const int RING_POW = 10; static const uint64_t RING_SIZE = 1ull << RING_POW; struct Cell { std::atomic val; std::atomic idx; uint64_t pad[14]; } __attribute__ ((aligned (128))); struct Node { std::atomic head __attribute__ ((aligned (128))); std::atomic tail __attribute__ ((aligned (128))); std::atomic next __attribute__ ((aligned (128))); Cell array[RING_SIZE]; Node() { for (unsigned i = 0; i < RING_SIZE; i++) { array[i].val.store(nullptr, std::memory_order_relaxed); array[i].idx.store(i, std::memory_order_relaxed); } head.store(0, std::memory_order_relaxed); tail.store(0, std::memory_order_relaxed); next.store(nullptr, std::memory_order_relaxed); } }; alignas(128) std::atomic head; alignas(128) std::atomic tail; static const int MAX_THREADS = 128; const int maxThreads; HazardPointers hp {1, maxThreads}; const int kHpTail = 0; const int kHpHead = 0; /* * Private methods */ int is_empty(T* v) { return (v == nullptr); } uint64_t node_index(uint64_t i) { return (i & ~(1ull << 63)); } uint64_t set_unsafe(uint64_t i) { return (i | (1ull << 63)); } uint64_t node_unsafe(uint64_t i) { return (i & (1ull << 63)); } inline uint64_t tail_index(uint64_t t) { return (t & ~(1ull << 63)); } int crq_is_closed(uint64_t t) { return (t & (1ull << 63)) != 0; } void fixState(Node *lhead) { while (1) { uint64_t t = lhead->tail.fetch_add(0); uint64_t h = lhead->head.fetch_add(0); // TODO: is it ok or not to cast "t" to int64_t ? if (lhead->tail.load() != (int64_t)t) continue; if (h > t) { int64_t tmp = t; if (lhead->tail.compare_exchange_strong(tmp, h)) break; continue; } break; } } int close_crq(Node *rq, const uint64_t tailticket, const int tries) { if (tries < 10) { int64_t tmp = tailticket + 1; return rq->tail.compare_exchange_strong(tmp, (tailticket + 1)|(1ull<<63)); } else { return BIT_TEST_AND_SET(&rq->tail, 63); } } public: LCRQueue(int maxThreads=MAX_THREADS) : maxThreads{maxThreads} { // Shared object init Node *sentinel = new Node; head.store(sentinel, std::memory_order_relaxed); tail.store(sentinel, std::memory_order_relaxed); } ~LCRQueue() { while (dequeue(0) != nullptr); // Drain the queue delete head.load(); // Delete the last node } static std::string className() { return "LCRQueue"; } void enqueue(T* item, const int tid) { int try_close = 0; while (true) { Node* ltail = hp.protectPtr(kHpTail, tail.load(), tid); if (ltail != tail.load()) continue; Node *lnext = ltail->next.load(); if (lnext != nullptr) { // Help advance the tail tail.compare_exchange_strong(ltail, lnext); continue; } uint64_t tailticket = ltail->tail.fetch_add(1); if (crq_is_closed(tailticket)) { Node* newNode = new Node(); // Solo enqueue (superfluous?) 
newNode->tail.store(1, std::memory_order_relaxed); newNode->array[0].val.store(item, std::memory_order_relaxed); newNode->array[0].idx.store(0, std::memory_order_relaxed); Node* nullnode = nullptr; if (ltail->next.compare_exchange_strong(nullnode, newNode)) {// Insert new ring tail.compare_exchange_strong(ltail, newNode); // Advance the tail hp.clear(tid); return; } delete newNode; continue; } Cell* cell = <ail->array[tailticket & (RING_SIZE-1)]; uint64_t idx = cell->idx.load(); if (cell->val.load() == nullptr) { if (node_index(idx) <= tailticket) { // TODO: is the missing cast before "t" ok or not to add? if ((!node_unsafe(idx) || ltail->head.load() < (int64_t)tailticket)) { if (CAS2((void**)cell, nullptr, idx, item, tailticket)) { hp.clear(tid); return; } } } } if (((int64_t)(tailticket - ltail->head.load()) >= (int64_t)RING_SIZE) && close_crq(ltail, tailticket, ++try_close)) continue; } } T* dequeue(const int tid) { while (true) { Node* lhead = hp.protectPtr(kHpHead, head.load(), tid); if (lhead != head.load()) continue; uint64_t headticket = lhead->head.fetch_add(1); Cell* cell = &lhead->array[headticket & (RING_SIZE-1)]; int r = 0; uint64_t tt = 0; while (true) { uint64_t cell_idx = cell->idx.load(); uint64_t unsafe = node_unsafe(cell_idx); uint64_t idx = node_index(cell_idx); T* val = cell->val.load(); if (idx > headticket) break; if (val != nullptr) { if (idx == headticket) { if (CAS2((void**)cell, val, cell_idx, nullptr, unsafe | (headticket + RING_SIZE))) { hp.clear(tid); return val; } } else { if (CAS2((void**)cell, val, cell_idx, val, set_unsafe(idx))) break; } } else { if ((r & ((1ull << 10) - 1)) == 0) tt = lhead->tail.load(); // Optimization: try to bail quickly if queue is closed. int crq_closed = crq_is_closed(tt); uint64_t t = tail_index(tt); if (unsafe) { // Nothing to do, move along if (CAS2((void**)cell, val, cell_idx, val, unsafe | (headticket + RING_SIZE))) break; } else if (t < headticket + 1 || r > 200000 || crq_closed) { if (CAS2((void**)cell, val, idx, val, headticket + RING_SIZE)) { if (r > 200000 && tt > RING_SIZE) BIT_TEST_AND_SET(&lhead->tail, 63); break; } } else { ++r; } } } if (tail_index(lhead->tail.load()) <= headticket + 1) { fixState(lhead); // try to return empty Node* lnext = lhead->next.load(); if (lnext == nullptr) { hp.clear(tid); return nullptr; // Queue is empty } if (tail_index(lhead->tail) <= headticket + 1) { if (head.compare_exchange_strong(lhead, lnext)) hp.retire(lhead, tid); } } } } }; #endif /* _LCRQ_QUEUE_HP_H_ */ ================================================ FILE: datastructures/queues/MichaelScottQueue.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2016, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _MICHAEL_SCOTT_QUEUE_HP_H_ #define _MICHAEL_SCOTT_QUEUE_HP_H_ #include #include #include "HazardPointers.hpp" /** *

 * Michael-Scott Queue

 *
 * enqueue algorithm: MS enqueue
 * dequeue algorithm: MS dequeue
 * Consistency: Linearizable
 * enqueue() progress: lock-free
 * dequeue() progress: lock-free
 * Memory Reclamation: Hazard Pointers (lock-free)
 *
 * Maged Michael and Michael Scott's Queue with Hazard Pointers
 *

 * Lock-Free Linked List as described in Maged Michael and Michael Scott's paper:
 * {@link http://www.cs.rochester.edu/~scott/papers/1996_PODC_queues.pdf}
 *
 * Simple, Fast, and Practical Non-Blocking and Blocking Concurrent Queue Algorithms
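 *
 * Dequeue protects both the current head and its successor with hazard
 * pointers before swinging the head; a sketch of the pattern used below:
 *
 *   Node* node = hp.protect(kHpHead, head, tid);        // protect head
 *   Node* lnext = hp.protect(kHpNext, node->next, tid); // protect successor
 *   if (casHead(node, lnext)) hp.retire(node, tid);     // now safe to retire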

* The paper on Hazard Pointers is named "Hazard Pointers: Safe Memory * Reclamation for Lock-Free objects" and it is available here: * http://web.cecs.pdx.edu/~walpole/class/cs510/papers/11.pdf * */ template class MichaelScottQueue { private: struct Node { T* item; std::atomic next; Node(T* userItem) : item{userItem}, next{nullptr} { } bool casNext(Node *cmp, Node *val) { return next.compare_exchange_strong(cmp, val); } }; bool casTail(Node *cmp, Node *val) { return tail.compare_exchange_strong(cmp, val); } bool casHead(Node *cmp, Node *val) { return head.compare_exchange_strong(cmp, val); } // Pointers to head and tail of the list alignas(128) std::atomic head; alignas(128) std::atomic tail; static const int MAX_THREADS = 128; const int maxThreads; // We need two hazard pointers for dequeue() HazardPointers hp {2, maxThreads}; const int kHpTail = 0; const int kHpHead = 0; const int kHpNext = 1; public: MichaelScottQueue(int maxThreads=MAX_THREADS) : maxThreads{maxThreads} { Node* sentinelNode = new Node(nullptr); head.store(sentinelNode, std::memory_order_relaxed); tail.store(sentinelNode, std::memory_order_relaxed); } ~MichaelScottQueue() { while (dequeue(0) != nullptr); // Drain the queue delete head.load(); // Delete the last node } static std::string className() { return "MichaelScottQueue"; } void enqueue(T* item, const int tid) { if (item == nullptr) throw std::invalid_argument("item can not be nullptr"); Node* newNode = new Node(item); while (true) { Node* ltail = hp.protectPtr(kHpTail, tail, tid); if (ltail == tail.load()) { Node* lnext = ltail->next.load(); if (lnext == nullptr) { // It seems this is the last node, so add the newNode here // and try to move the tail to the newNode if (ltail->casNext(nullptr, newNode)) { casTail(ltail, newNode); hp.clear(tid); return; } } else { casTail(ltail, lnext); } } } } T* dequeue(const int tid) { Node* node = hp.protect(kHpHead, head, tid); while (node != tail.load()) { Node* lnext = hp.protect(kHpNext, node->next, tid); if (casHead(node, lnext)) { T* item = lnext->item; // Another thread may clean up lnext after we do hp.clear() hp.clear(tid); hp.retire(node, tid); return item; } node = hp.protect(kHpHead, head, tid); } hp.clear(tid); return nullptr; // Queue is empty } }; #endif /* _MICHAEL_SCOTT_QUEUE_HP_H_ */ ================================================ FILE: datastructures/queues/OFLFArrayLinkedListQueue.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _OF_LF_ARRAY_LINKED_LIST_QUEUE_H_ #define _OF_LF_ARRAY_LINKED_LIST_QUEUE_H_ #include #include #include "stms/OneFileLF.hpp" /** *

 * An Array Linked List Queue using OneFile STM (Lock-Free)
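 *
 * Every operation is a OneFile transaction over tmtype fields; a minimal
 * sketch of the fast path of the enqueue() below:
 *
 *   oflf::updateTx([this,item] () -> bool {
 *       Node* ltail = tail;
 *       uint64_t ltailidx = ltail->tailidx;
 *       ltail->items[ltailidx] = item;   // the STM makes these writes atomic
 *       ++ltail->tailidx;
 *       return true;
 *   });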

* * TODO * * * enqueue algorithm: sequential implementation + MWC * dequeue algorithm: sequential implementation + MWC * Consistency: Linearizable * enqueue() progress: lock-free * dequeue() progress: lock-free * Memory Reclamation: Hazard Eras (integrated into MWC) * enqueue min ops: 2 DCAS + 1 CAS * dequeue min ops: 1 DCAS + 1 CAS */ template class OFLFArrayLinkedListQueue : public oflf::tmbase { private: /* struct cell { onefilelf::tmtype val; } __attribute__ ((aligned (128))); */ struct Node : oflf::tmbase { static const int ITEM_NUM = 1024; // TODO: use a larger ring buffer size here, 1024 for example oflf::tmtype headidx {0}; //cell items[ITEM_NUM]; oflf::tmtype items[ITEM_NUM]; oflf::tmtype tailidx {0}; oflf::tmtype next {nullptr}; Node(T* item) { items[0] = item; tailidx = 1; headidx = 0; for (int i = 1; i < ITEM_NUM; i++) items[i] = nullptr; } }; oflf::tmtype head {nullptr}; oflf::tmtype tail {nullptr}; public: OFLFArrayLinkedListQueue(unsigned int maxThreads=0) { Node* sentinelNode = new Node(nullptr); sentinelNode->tailidx = 0; head = sentinelNode; tail = sentinelNode; } ~OFLFArrayLinkedListQueue() { while (dequeue(0) != nullptr); // Drain the queue Node* lhead = head; delete lhead; } static std::string className() { return "OF-LF-ArrayLinkedListQueue"; } /* * Progress Condition: lock-free * Always returns true */ bool enqueue(T* item, const int tid=0) { if (item == nullptr) throw std::invalid_argument("item can not be nullptr"); return oflf::updateTx([this,item] () -> bool { Node* ltail = tail; uint64_t ltailidx = ltail->tailidx; if (ltailidx < Node::ITEM_NUM) { ltail->items[ltailidx] = item; ++ltail->tailidx; return true; } Node* newNode = oflf::tmNew(item); tail->next = newNode; tail = newNode; return true; }); } /* * Progress Condition: lock-free */ T* dequeue(const int tid=0) { return oflf::updateTx([this] () -> T* { Node* lhead = head; uint64_t lheadidx = lhead->headidx; // Check if queue is empty if (lhead == tail && lheadidx == tail->tailidx) return nullptr; if (lheadidx < Node::ITEM_NUM) { ++lhead->headidx; return lhead->items[lheadidx]; } lhead = lhead->next; oflf::tmDelete(head); head = lhead; ++lhead->headidx; return lhead->items[0]; }); } }; #endif /* _OF_LF_ARRAY_LINKED_LIST_QUEUE_H_ */ ================================================ FILE: datastructures/queues/OFLFArrayQueue.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _OFLF_STM_ARRAY_QUEUE_H_ #define _OFLF_STM_ARRAY_QUEUE_H_ #include #include #include "stms/OneFileLF.hpp" /** *

 * An Array Queue
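 *
 * Unlike the other queues in this folder, this one is bounded: enqueue()
 * returns false once MAX_ITEMS entries are occupied. Illustrative check
 * (names are for illustration only):
 *
 *   OFLFArrayQueue<int> q;
 *   int x = 7;
 *   if (!q.enqueue(&x)) { }   // false means the ring buffer is full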

* */ template class OFLFArrayQueue : public oflf::tmbase { private: static const int MAX_ITEMS = 2048; oflf::tmtype headidx {0}; oflf::tmtype items[MAX_ITEMS]; oflf::tmtype tailidx {0}; public: OFLFArrayQueue(unsigned int maxThreads=0) { oflf::updateTx([this] () { for (int i = 0; i < MAX_ITEMS; i++) items[i] = nullptr; return true; }); } ~OFLFArrayQueue() { } static std::string className() { return "OF-LF-ArrayQueue"; } /* * Progress Condition: blocking * Always returns true */ bool enqueue(T* item, const int tid=0) { if (item == nullptr) throw std::invalid_argument("item can not be nullptr"); return oflf::updateTx([this,item] () -> bool { if (tailidx >= headidx+MAX_ITEMS) return false; // queue is full items[tailidx % MAX_ITEMS] = item; ++tailidx; return true; }); } /* * Progress Condition: blocking */ T* dequeue(const int tid=0) { return oflf::updateTx([this] () -> T* { if (tailidx == headidx) return nullptr; // queue is empty T* item = items[headidx % MAX_ITEMS]; ++headidx; return item; }); } }; #endif /* _OF_LF_STM_ARRAY_QUEUE_H_ */ ================================================ FILE: datastructures/queues/OFLFLinkedListQueue.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _OF_LF_LINKED_LIST_QUEUE_H_ #define _OF_LF_LINKED_LIST_QUEUE_H_ #include #include #include "stms/OneFileLF.hpp" /** *

 * A Linked List queue using OneFile STM (Lock-Free)
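 *
 * Note that enqueue() below allocates the node with oflf::tmNew() before
 * opening the transaction ("less overhead", per the inline comment), so the
 * transaction body only links it in. Sketch, assuming tmNew takes the node
 * type as template argument:
 *
 *   Node* newNode = oflf::tmNew<Node>(item);   // outside the transaction
 *   oflf::updateTx([this,newNode] () -> bool {
 *       tail->next = newNode;
 *       tail = newNode;
 *       return true;
 *   });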

* * enqueue algorithm: sequential implementation + OFLF * dequeue algorithm: sequential implementation + OFLF * Consistency: Linearizable * enqueue() progress: lock-free * dequeue() progress: lock-free * Memory Reclamation: lock-free Hazard Eras (integrated into OFLF) * enqueue min ops: 2 DCAS + 1 CAS * dequeue min ops: 1 DCAS + 1 CAS */ template class OFLFLinkedListQueue : public oflf::tmbase { private: struct Node : oflf::tmbase { T* item; oflf::tmtype next {nullptr}; Node(T* userItem) : item{userItem} { } }; oflf::tmtype head {nullptr}; oflf::tmtype tail {nullptr}; public: OFLFLinkedListQueue(unsigned int maxThreads=0) { Node* sentinelNode = oflf::tmNew(nullptr); head = sentinelNode; tail = sentinelNode; } ~OFLFLinkedListQueue() { while (dequeue() != nullptr); // Drain the queue Node* lhead = head; oflf::tmDelete(lhead); } static std::string className() { return "OF-LF-LinkedListQueue"; } /* * Progress Condition: lock-free * Always returns true */ bool enqueue(T* item, const int tid=0) { if (item == nullptr) throw std::invalid_argument("item can not be nullptr"); Node* newNode = oflf::tmNew(item); // Let's allocate outside the transaction, less overhead return oflf::updateTx([this,newNode] () -> bool { tail->next = newNode; tail = newNode; return true; }); } /* * Progress Condition: lock-free */ T* dequeue(const int tid=0) { return oflf::updateTx([this] () -> T* { Node* lhead = head; if (lhead == tail) return nullptr; head = lhead->next; oflf::tmDelete(lhead); return head->item; }); } }; #endif /* _OF_LF_LINKED_LIST_QUEUE_H_ */ ================================================ FILE: datastructures/queues/OFWFArrayLinkedListQueue.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _OF_WF_ARRAY_LINKED_LIST_QUEUE_H_ #define _OF_WF_ARRAY_LINKED_LIST_QUEUE_H_ #include #include #include "stms/OneFileWF.hpp" /** *

 * An Array Linked List Queue using OneFile STM (Wait-Free)
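 *
 * Same structure as OFLFArrayLinkedListQueue, but on the wait-free OneFile
 * STM (namespace ofwf). A sketch of the dequeue() fast path below, for the
 * case where the current node is not yet exhausted:
 *
 *   return ofwf::updateTx([this] () -> T* {
 *       Node* lhead = head;
 *       uint64_t lheadidx = lhead->headidx;
 *       if (lhead == tail && lheadidx == tail->tailidx) return nullptr; // empty
 *       ++lhead->headidx;               // fast path (lheadidx < ITEM_NUM);
 *       return lhead->items[lheadidx];  // the full code also advances nodes
 *   });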

* * TODO * * * enqueue algorithm: sequential implementation + MWC * dequeue algorithm: sequential implementation + MWC * Consistency: Linearizable * enqueue() progress: lock-free * dequeue() progress: lock-free * Memory Reclamation: Hazard Eras (integrated into MWC) * enqueue min ops: 2 DCAS + 1 CAS * dequeue min ops: 1 DCAS + 1 CAS */ template class OFWFArrayLinkedListQueue : public ofwf::tmbase { private: struct Node : ofwf::tmbase { static const int ITEM_NUM = 1024; ofwf::tmtype headidx {0}; ofwf::tmtype items[ITEM_NUM]; ofwf::tmtype tailidx {0}; ofwf::tmtype next {nullptr}; Node(T* item) { items[0] = item; tailidx = 1; headidx = 0; for (int i = 1; i < ITEM_NUM; i++) items[i] = nullptr; } }; ofwf::tmtype head {nullptr}; ofwf::tmtype tail {nullptr}; public: OFWFArrayLinkedListQueue(unsigned int maxThreads=0) { Node* sentinelNode = new Node(nullptr); sentinelNode->tailidx = 0; head = sentinelNode; tail = sentinelNode; } ~OFWFArrayLinkedListQueue() { while (dequeue(0) != nullptr); // Drain the queue Node* lhead = head; delete lhead; } static std::string className() { return "OF-WF-ArrayLinkedListQueue"; } /* * Progress Condition: lock-free * Always returns true */ bool enqueue(T* item, const int tid=0) { if (item == nullptr) throw std::invalid_argument("item can not be nullptr"); return ofwf::updateTx([this,item] () -> bool { Node* ltail = tail; uint64_t ltailidx = ltail->tailidx; if (ltailidx < Node::ITEM_NUM) { ltail->items[ltailidx] = item; ++ltail->tailidx; return true; } Node* newNode = ofwf::tmNew(item); tail->next = newNode; tail = newNode; return true; }); } /* * Progress Condition: lock-free */ T* dequeue(const int tid=0) { return ofwf::updateTx([this] () -> T* { Node* lhead = head; uint64_t lheadidx = lhead->headidx; // Check if queue is empty if (lhead == tail && lheadidx == tail->tailidx) return nullptr; if (lheadidx < Node::ITEM_NUM) { ++lhead->headidx; return lhead->items[lheadidx]; } lhead = lhead->next; ofwf::tmDelete(head); head = lhead; ++lhead->headidx; return lhead->items[0]; }); } }; #endif /* _OF_WF_ARRAY_LINKED_LIST_QUEUE_H_ */ ================================================ FILE: datastructures/queues/OFWFLinkedListQueue.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _OF_WF_LINKED_LIST_QUEUE_H_ #define _OF_WF_LINKED_LIST_QUEUE_H_ #include #include #include "stms/OneFileWF.hpp" /** *

 * A Linked List queue using OneFile STM (Wait-Free)

* * enqueue algorithm: sequential implementation + OFWF * dequeue algorithm: sequential implementation + OFWF * Consistency: Linearizable * enqueue() progress: wait-free * dequeue() progress: wait-free * Memory Reclamation: wait-free Hazard Eras (integrated into OFWF) * enqueue min ops: 3 DCAS + 1 CAS * dequeue min ops: 2 DCAS + 1 CAS */ template class OFWFLinkedListQueue : public ofwf::tmbase { private: struct Node : ofwf::tmbase { T* item; ofwf::tmtype next; Node(T* userItem) : item{userItem}, next{nullptr} { } }; ofwf::tmtype head {nullptr}; ofwf::tmtype tail {nullptr}; public: OFWFLinkedListQueue(unsigned int maxThreads=0) { Node* sentinelNode = ofwf::tmNew(nullptr); head = sentinelNode; tail = sentinelNode; } ~OFWFLinkedListQueue() { while (dequeue() != nullptr); // Drain the queue Node* lhead = head; ofwf::tmDelete(lhead); } static std::string className() { return "OF-WF-LinkedListQueue"; } /* * Progress Condition: wait-free bounded * Always returns true */ bool enqueue(T* item, const int tid=0) { if (item == nullptr) throw std::invalid_argument("item can not be nullptr"); Node* newNode = ofwf::tmNew(item); // Let's allocate outside the transaction, less overhead return ofwf::updateTx([this,newNode] () -> bool { tail->next = newNode; tail = newNode; return true; }); } /* * Progress Condition: wait-free bounded */ T* dequeue(const int tid=0) { return (T*)ofwf::updateTx([this] () -> T* { Node* lhead = head; if (lhead == tail) return nullptr; head = lhead->next; ofwf::tmDelete(lhead); return head->item; }); } }; #endif /* _OF_WF_LINKED_LIST_QUEUE_H_ */ ================================================ FILE: datastructures/queues/README.md ================================================ # Queues # This folder contains multiple multi-producer-multi-consumer queue implementations, all of them with integrated memory reclamation having the same progress condition: - FAAArrayQueue: Memory unbounded, lock-free, one array per node, hazard pointers http://... - LCRQueue: Memory unbounded, lock-free, one array per node, hazard pointers, can re-use entries in some situations http:// - OFLFLinkedListqueue: Memory unbounded, lock-free, one entry per node, hazard eras Uses OneFile STM (Lock-Free) - OFWFLinkedListqueue: Memory unbounded, wait-free bounded, one entry per node, hazard eras Uses OneFile STM (Wait-Free) - SimQueue: Memory unbounded, wait-free bounded, one entry per node, modified hazard pointers http:// - TurnQueue: Memory unbounded, wait-free bounded, one entry per node, hazard pointers http:// ================================================ FILE: datastructures/queues/SimQueue.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2016, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _CR_SIM_QUEUE_HP_H_ #define _CR_SIM_QUEUE_HP_H_ #include #include #include "HazardPointersSimQueue.hpp" /** *

 * Sim Queue

 *
 * Based on the SimQueue (FK queue)
 * http://thalis.cs.uoi.gr/tech_reports/publications/TR2011-01.pdf
 *
 * enqueue algorithm: P-Sim
 * dequeue algorithm: P-Sim
 * Consistency: Linearizable
 * enqueue() progress: wait-free bounded O(N_threads)
 * dequeue() progress: wait-free bounded O(N_threads)
 * Memory Reclamation: Hazard Pointers with custom scanner for Nodes. EnqState and DeqState re-usage.
 *
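 *
 * The shared state is swung with a single 64-bit CAS on a packed
 * (sequence, index) word, using the pointer_t union defined below:
 *
 *   pointer_t myPointer;
 *   myPointer.u.seq = lpointer.u.seq + 1;  // 48-bit sequence, avoids ABA
 *   myPointer.u.index = myIndex;           // 16-bit slot into enqReused[]
 *   enqPointer.compare_exchange_strong(lpointer, myPointer);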

* The paper on Hazard Pointers is named "Hazard Pointers: Safe Memory * Reclamation for Lock-Free objects" and it is available here: * http://web.cecs.pdx.edu/~walpole/class/cs510/papers/11.pdf * */ template class SimQueue { private: static const int MAX_THREADS = 128; struct Node { T* item; std::atomic next {nullptr}; Node(T* item) : item{item} { } }; struct EnqState { std::atomic tail {nullptr}; // link_a std::atomic nextNode {nullptr}; // link_b std::atomic nextTail {nullptr}; // ptr std::atomic applied[MAX_THREADS]; EnqState() { for(int i=0; i < MAX_THREADS; i++){ applied[i].store(false, std::memory_order_relaxed); } } }; struct DeqState { std::atomic head {nullptr}; std::atomic items[MAX_THREADS]; std::atomic applied[MAX_THREADS]; DeqState() { for(int i=0; i < MAX_THREADS; i++){ applied[i].store(false, std::memory_order_relaxed); items[i].store(nullptr, std::memory_order_relaxed); } } }; typedef union pointer_t { struct StructData{ int64_t seq : 48; int64_t index: 16; } u; // struct_data int64_t raw; // raw_data } pointer_t; const int maxThreads; alignas(128) std::atomic enqPointer; alignas(128) std::atomic deqPointer; // Enqueue requests alignas(128) std::atomic items[MAX_THREADS]; // Always access relaxed alignas(128) std::atomic enqueuers[MAX_THREADS]; // Re-usable EnqState instances alignas(128) EnqState enqReused[MAX_THREADS*2]; // Dequeue requests alignas(128) std::atomic dequeuers[MAX_THREADS]; // Re-usable DeqState instances alignas(128) DeqState deqReused[MAX_THREADS*2]; alignas(128) Node* pool[MAX_THREADS][MAX_THREADS]; // Passed to Hazard Pointers std::function find = [this](Node* ptr) { pointer_t lpointer = enqPointer.load(); if (enqReused[lpointer.u.index].tail.load() == ptr) return true; /* lpointer = deqPointer.load(); if (deqReused[lpointer.u.index].head.load() == ptr) return true; */ return false; }; HazardPointersSimQueue hp {find, 1, maxThreads}; const int kHpTail = 0; const int kHpNode = 0; Node* sentinel = new Node(nullptr); public: SimQueue(int maxThreads=MAX_THREADS) : maxThreads(maxThreads) { for (int i = 0; i < maxThreads; i++) { enqueuers[i].store(false, std::memory_order_relaxed); dequeuers[i].store(false, std::memory_order_relaxed); for(int j=0;jtail.load(), tid); Node* lnext = lstate->nextNode.load(); // No need for HP because we don't dereference it Node* lnextTail = lstate->nextTail.load(); // No need for HP if (lpointer.raw != enqPointer.load().raw) continue; // Advance the tail if needed if (ltail->next.load() != lnext) { ltail->next.store(lnext, std::memory_order_release); } // Check if my request has been done if (lstate->applied[tid].load() == newrequest) { if (lpointer.raw == enqPointer.load().raw) break; } // Help opened enqueue requests, starting from zero Node* first = nullptr; Node* node = nullptr; const int myIndex = (lpointer.u.index == 2*tid) ? 
2*tid+1 : 2*tid ; EnqState* const myState = &enqReused[myIndex]; int numNodes = 0; for (int j = 0; j < maxThreads; j++) { // Check if it is an open request const bool enqj = enqueuers[j].load(); myState->applied[j].store(enqj, std::memory_order_relaxed); if (enqj == lstate->applied[j].load()) continue; Node* prev = node; node = pool[tid][numNodes++]; node->item = items[j].load(std::memory_order_relaxed); if (first == nullptr) { first = node; } else { prev->next.store(node, std::memory_order_relaxed); } if (lpointer.raw != enqPointer.load().raw) break; } // Try to apply the new sublist if (lpointer.raw != enqPointer.load().raw) continue; node->next.store(nullptr, std::memory_order_relaxed); myState->tail.store(lnextTail, std::memory_order_relaxed); myState->nextNode.store(first, std::memory_order_relaxed); myState->nextTail.store(node, std::memory_order_relaxed); pointer_t myPointer; myPointer.u.seq = lpointer.u.seq + 1; myPointer.u.index = myIndex; if (enqPointer.compare_exchange_strong(lpointer, myPointer)) { for (int k = 0; k < numNodes; k++) { // Refill pool pool[tid][k] = new Node(nullptr); } } } hp.clear(tid); } /** * Progress condition: wait-free bounded * * We use just one HP index, but it was though to get there. */ T* dequeue(const int tid) { // Publish dequeue request bool newrequest = !dequeuers[tid].load(std::memory_order_relaxed); dequeuers[tid].store(newrequest); for (int iter = 0; iter < 2; iter++) { pointer_t lpointer = deqPointer.load(); DeqState* lstate = &deqReused[lpointer.u.index]; // Check if my request has been done if (lstate->applied[tid].load() == newrequest) { if (lpointer.raw == deqPointer.load().raw) break; } // Help opened dequeue requests, starting from turn+1 Node* newHead = hp.protectPtr(kHpNode, lstate->head, tid); if (lpointer.raw != deqPointer.load().raw) continue; const int myIndex = (lpointer.u.index == 2*tid) ? 2*tid+1 : 2*tid ; DeqState* const myState = &deqReused[myIndex]; Node* node = newHead; for (int j = 0; j < maxThreads; j++) { // Check if it is an open request const bool applied = lstate->applied[j].load(); if (dequeuers[j].load() == applied) { myState->items[j].store(lstate->items[j], std::memory_order_relaxed); myState->applied[j].store(applied, std::memory_order_relaxed); continue; } myState->applied[j].store(!applied, std::memory_order_relaxed); if (node->next.load() == nullptr) { myState->items[j].store(nullptr,std::memory_order_relaxed); } else { node = hp.protectPtr(kHpNode, node->next, tid); if (lpointer.raw != deqPointer.load().raw) break; myState->items[j].store(node->item, std::memory_order_relaxed); newHead = node; } } if (lpointer.raw != deqPointer.load().raw) continue; pointer_t newDeqIndex; newDeqIndex.u.seq = lpointer.u.seq + 1; newDeqIndex.u.index = myIndex; myState->head.store(newHead, std::memory_order_relaxed); node = lstate->head; if (deqPointer.compare_exchange_strong(lpointer, newDeqIndex)) { while (node != newHead) { Node* next = node->next.load(); hp.retire(node,tid); node = next; } break; } } hp.clear(tid); return deqReused[deqPointer.load().u.index].items[tid].load(); } }; #endif /* _SIM_QUEUE_HP_H_ */ ================================================ FILE: datastructures/queues/TinySTMArrayLinkedListQueue.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2017, Pedro Ramalhete, Andreia Correia * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _TINY_STM_ARRAY_LINKED_LIST_QUEUE_H_ #define _TINY_STM_ARRAY_LINKED_LIST_QUEUE_H_ #include #include #include "stms/TinySTM.hpp" /** *

 * An Array Linked List Queue using Tiny STM
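 *
 * With TinySTM even the constructor and destructor run their work inside a
 * transaction; a sketch of the constructor below, assuming tmNew takes the
 * node type as template argument:
 *
 *   tinystm::updateTx([this] () {
 *       Node* sentinelNode = tinystm::tmNew<Node>(nullptr);
 *       sentinelNode->tailidx = 0;
 *       head = sentinelNode;
 *       tail = sentinelNode;
 *       return true;
 *   });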

*/ template class TinySTMArrayLinkedListQueue { private: struct Node : tinystm::tmbase { static const int ITEM_NUM = 1024; tinystm::tmtype headidx {0}; tinystm::tmtype items[ITEM_NUM]; tinystm::tmtype tailidx {0}; tinystm::tmtype next {nullptr}; Node(T* item) { items[0] = item; tailidx = 1; headidx = 0; for (int i = 1; i < ITEM_NUM; i++) items[i] = nullptr; } }; tinystm::tmtype head {nullptr}; tinystm::tmtype tail {nullptr}; public: TinySTMArrayLinkedListQueue(unsigned int maxThreads=0) { tinystm::updateTx([this] () { Node* sentinelNode = tinystm::tmNew(nullptr); sentinelNode->tailidx = 0; head = sentinelNode; tail = sentinelNode; return true; }); } ~TinySTMArrayLinkedListQueue() { while (dequeue() != nullptr); // Drain the queue tinystm::updateTx([this] () { Node* lhead = head; tinystm::tmDelete(lhead); return true; }); } static std::string className() { return "TinySTM-ArrayLinkedListQueue"; } /* * Progress Condition: blocking * Always returns true */ bool enqueue(T* item, const int tid=0) { if (item == nullptr) throw std::invalid_argument("item can not be nullptr"); return tinystm::updateTx([this,item] () -> bool { Node* ltail = tail; uint64_t ltailidx = ltail->tailidx; if (ltailidx < Node::ITEM_NUM) { ltail->items[ltailidx] = item; ++ltail->tailidx; return true; } Node* newNode = tinystm::tmNew(item); tail->next = newNode; tail = newNode; return true; }); } /* * Progress Condition: blocking */ T* dequeue(const int tid=0) { return tinystm::updateTx([this] () -> T* { Node* lhead = head; uint64_t lheadidx = lhead->headidx; // Check if queue is empty if (lhead == tail && lheadidx == tail->tailidx) return nullptr; if (lheadidx < Node::ITEM_NUM) { ++lhead->headidx; return lhead->items[lheadidx]; } lhead = lhead->next; tinystm::tmDelete(head.load()); head = lhead; ++lhead->headidx; return lhead->items[0]; }); } }; #endif /* _TINY_STM_ARRAY_LINKED_LIST_QUEUE_H_ */ ================================================ FILE: datastructures/queues/TinySTMLinkedListQueue.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2018, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _TINY_STM_LINKED_LIST_QUEUE_H_ #define _TINY_STM_LINKED_LIST_QUEUE_H_ #include #include #include "stms/TinySTM.hpp" /** *

 * A Linked List queue using Tiny STM

*/ template class TinySTMLinkedListQueue : public tinystm::tmbase { private: struct Node : tinystm::tmbase { T* item; tinystm::tmtype next {nullptr}; Node(T* userItem) : item{userItem} { } }; tinystm::tmtype head {nullptr}; tinystm::tmtype tail {nullptr}; public: TinySTMLinkedListQueue(unsigned int maxThreads=0) { tinystm::updateTx([this] () { Node* sentinelNode = tinystm::tmNew(nullptr); head = sentinelNode; tail = sentinelNode; return true; }); } ~TinySTMLinkedListQueue() { while (dequeue() != nullptr); // Drain the queue tinystm::updateTx([this] () { Node* lhead = head; tinystm::tmDelete(lhead); return true; }); } static std::string className() { return "TinySTM-LinkedListQueue"; } /* * Progress Condition: blocking * Always returns true */ bool enqueue(T* item, const int tid=0) { if (item == nullptr) throw std::invalid_argument("item can not be nullptr"); return tinystm::updateTx([this,item] () -> bool { Node* newNode = tinystm::tmNew(item); tail->next = newNode; tail = newNode; return true; }); } /* * Progress Condition: blocking */ T* dequeue(const int tid=0) { return tinystm::updateTx([this] () -> T* { Node* lhead = head; if (lhead == tail) return nullptr; head = lhead->next; tinystm::tmDelete(lhead); return head->item; }); } }; #endif /* _TINY_STM_LINKED_LIST_QUEUE_H_ */ ================================================ FILE: datastructures/queues/TurnQueue.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2016, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _TURN_QUEUE_HP_H_ #define _TURN_QUEUE_HP_H_ #include #include #include "HazardPointers.hpp" /** *

 * Turn Queue

 *
 * A concurrent wait-free queue that is Multi-Producer-Multi-Consumer and does
 * its own wait-free memory reclamation.
 * Based on the paper "A Wait-Free Queue with Wait-Free Memory Reclamation"
 * https://github.com/pramalhe/ConcurrencyFreaks/tree/master/papers/crturnqueue-2016.pdf
 *

 * Enqueue algorithm: CR Turn enqueue
 * Dequeue algorithm: CR Turn dequeue
 * Consistency: Linearizable
 * enqueue() progress: wait-free bounded O(N_threads)
 * dequeue() progress: wait-free bounded O(N_threads)
 * Memory Reclamation: Hazard Pointers (wait-free)
 *
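 *
 * Dequeues are served in turns: each node records in deqTid which thread's
 * request it satisfied, and searchNext() below scans for the next open
 * request starting just after that turn. Sketched from searchNext():
 *
 *   const int turn = lhead->deqTid.load();
 *   for (int idx = turn+1; idx < turn+maxThreads+1; idx++) {
 *       const int idDeq = idx % maxThreads;   // round-robin over threads
 *       if (deqself[idDeq].load() != deqhelp[idDeq].load()) continue;
 *       if (lnext->deqTid.load() == IDX_NONE) lnext->casDeqTid(IDX_NONE, idDeq);
 *       break;
 *   }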

* The paper on Hazard Pointers is named "Hazard Pointers: Safe Memory * Reclamation for Lock-Free objects" and it is available here: * http://web.cecs.pdx.edu/~walpole/class/cs510/papers/11.pdf * * @author Andreia Correia * @author Pedro Ramalhete */ template class TurnQueue { private: struct Node { T* item; const int enqTid; std::atomic deqTid; std::atomic next; Node(T* item, int tid) : item{item}, enqTid{tid}, deqTid{IDX_NONE}, next{nullptr} { } bool casDeqTid(int cmp, int val) { return deqTid.compare_exchange_strong(cmp, val); } }; static const int IDX_NONE = -1; static const int MAX_THREADS = 128; const int maxThreads; // Pointers to head and tail of the list alignas(128) std::atomic head; alignas(128) std::atomic tail; // Enqueue requests alignas(128) std::atomic enqueuers[MAX_THREADS]; // Dequeue requests alignas(128) std::atomic deqself[MAX_THREADS]; alignas(128) std::atomic deqhelp[MAX_THREADS]; HazardPointers hp {3, maxThreads}; // We need three hazard pointers const int kHpTail = 0; const int kHpHead = 0; const int kHpNext = 1; const int kHpDeq = 2; Node* sentinelNode = new Node(nullptr, 0); /** * Called only from dequeue() * * Search for the next request to dequeue and assign it to lnext.deqTid * It is only a request to dequeue if deqself[i] equals deqhelp[i]. */ inline int searchNext(Node* lhead, Node* lnext) { const int turn = lhead->deqTid.load(); for (int idx=turn+1; idx < turn+maxThreads+1; idx++) { const int idDeq = idx%maxThreads; if (deqself[idDeq].load() != deqhelp[idDeq].load()) continue; if (lnext->deqTid.load() == IDX_NONE) lnext->casDeqTid(IDX_NONE, idDeq); break; } return lnext->deqTid.load(); } /** * Called only from dequeue() * * If the ldeqTid is not our own, we must use an HP to protect against * deqhelp[ldeqTid] being retired-deleted-newed-reenqueued. 
*/ inline void casDeqAndHead(Node* lhead, Node* lnext, const int tid) { const int ldeqTid = lnext->deqTid.load(); if (ldeqTid == tid) { deqhelp[ldeqTid].store(lnext, std::memory_order_release); } else { Node* ldeqhelp = hp.protectPtr(kHpDeq, deqhelp[ldeqTid].load(), tid); if (ldeqhelp != lnext && lhead == head.load()) { deqhelp[ldeqTid].compare_exchange_strong(ldeqhelp, lnext); // Assign next to request } } head.compare_exchange_strong(lhead, lnext); } /** * Called only from dequeue() * * Giveup procedure, for when there are no nodes left to dequeue */ inline void giveUp(Node* myReq, const int tid) { Node* lhead = head.load(); if (deqhelp[tid].load() != myReq || lhead == tail.load()) return; hp.protectPtr(kHpHead, lhead, tid); if (lhead != head.load()) return; Node* lnext = hp.protectPtr(kHpNext, lhead->next.load(), tid); if (lhead != head.load()) return; if (searchNext(lhead, lnext) == IDX_NONE) lnext->casDeqTid(IDX_NONE, tid); casDeqAndHead(lhead, lnext, tid); } public: TurnQueue(int maxThreads=MAX_THREADS) : maxThreads(maxThreads) { head.store(sentinelNode, std::memory_order_relaxed); tail.store(sentinelNode, std::memory_order_relaxed); for (int i = 0; i < maxThreads; i++) { enqueuers[i].store(nullptr, std::memory_order_relaxed); // deqself[i] != deqhelp[i] means that isRequest=false deqself[i].store(new Node(nullptr, 0), std::memory_order_relaxed); deqhelp[i].store(new Node(nullptr, 0), std::memory_order_relaxed); } } ~TurnQueue() { delete sentinelNode; while (dequeue(0) != nullptr); // Drain the queue for (int i=0; i < maxThreads; i++) delete deqself[i].load(); for (int i=0; i < maxThreads; i++) delete deqhelp[i].load(); } static std::string className() { return "TurnQueue"; } /** * Steps when uncontended: * 1. Add node to enqueuers[] * 2. Insert node in tail.next using a CAS * 3. Advance tail to tail.next * 4. Remove node from enqueuers[] * * @param tid The tid must be a UNIQUE index for each thread, in the range 0 to maxThreads-1 */ void enqueue(T* item, const int tid) { if (item == nullptr) throw std::invalid_argument("item can not be nullptr"); Node* myNode = new Node(item,tid); enqueuers[tid].store(myNode); for (int i = 0; i < maxThreads; i++) { if (enqueuers[tid].load() == nullptr) { hp.clear(tid); return; // Some thread did all the steps } Node* ltail = hp.protectPtr(kHpTail, tail.load(), tid); if (ltail != tail.load()) continue; // If the tail advanced maxThreads times, then my node has been enqueued if (enqueuers[ltail->enqTid].load() == ltail) { // Help a thread do step 4 Node* tmp = ltail; enqueuers[ltail->enqTid].compare_exchange_strong(tmp, nullptr); } for (int j = 1; j < maxThreads+1; j++) { // Help a thread do step 2 Node* nodeToHelp = enqueuers[(j + ltail->enqTid) % maxThreads].load(); if (nodeToHelp == nullptr) continue; Node* nodenull = nullptr; ltail->next.compare_exchange_strong(nodenull, nodeToHelp); break; } Node* lnext = ltail->next.load(); if (lnext != nullptr) tail.compare_exchange_strong(ltail, lnext); // Help a thread do step 3 } enqueuers[tid].store(nullptr, std::memory_order_release); // Do step 4, just in case it's not done hp.clear(tid); } /** * Steps when uncontended: * 1. Publish request to dequeue in dequeuers[tid]; * 2. CAS node->deqTid from IDX_START to tid; * 3. Set dequeuers[tid] to the newly owned node; * 4. Advance the head with casHead(); * * We must protect either head or tail with HP before doing the check for * empty queue, otherwise we may get into retired-deleted-newed-reenqueued. 
* * @param tid: The tid must be a UNIQUE index for each thread, in the range 0 to maxThreads-1 */ T* dequeue(const int tid) { Node* prReq = deqself[tid].load(); // Previous request Node* myReq = deqhelp[tid].load(); deqself[tid].store(myReq); // Step 1 for (int i=0; i < maxThreads; i++) { if (deqhelp[tid].load() != myReq) break; // No need for HP Node* lhead = hp.protectPtr(kHpHead, head.load(), tid); if (lhead != head.load()) continue; if (lhead == tail.load()) { // Give up deqself[tid].store(prReq); // Rollback request to dequeue giveUp(myReq, tid); if (deqhelp[tid].load() != myReq) { deqself[tid].store(myReq, std::memory_order_relaxed); break; } hp.clear(tid); return nullptr; } Node* lnext = hp.protectPtr(kHpNext, lhead->next.load(), tid); if (lhead != head.load()) continue; if (searchNext(lhead, lnext) != IDX_NONE) casDeqAndHead(lhead, lnext, tid); } Node* myNode = deqhelp[tid].load(); Node* lhead = hp.protectPtr(kHpHead, head.load(), tid); // Do step 4 if needed if (lhead == head.load() && myNode == lhead->next.load()) head.compare_exchange_strong(lhead, myNode); hp.clear(tid); hp.retire(prReq, tid); return myNode->item; } }; #endif /* _CR_TURN_QUEUE_HP_H_ */ ================================================ FILE: datastructures/sequential/HashSet.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2018, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _UC_HASH_SET_H_ #define _UC_HASH_SET_H_ #include #include #include // TODO: change CKey* to CKey& // This is a wrapper to std::set, which should be a Red-Black tree template class HashSet { private: std::unordered_set set; public: static std::string className() { return "HashSet"; } bool add(CKey key) { if (set.find(key) == set.end()) { set.insert(key); // TODO: can we improve this so we don't have to lookup twice? 
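            // (One possible answer, offered as an untested sketch:
            //  std::unordered_set::insert() returns a pair whose .second
            //  reports whether insertion took place, so
            //  "return set.insert(key).second;" would do a single lookup.)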
return true; } return false; } bool remove(CKey key) { auto iter = set.find(key); if (iter == set.end()) return false; set.erase(iter); return true; } bool contains(CKey key) { if (set.find(key) == set.end()) return false; return true; // TODO: optimize this } bool iterateAll(std::function itfun) { for (auto it = set.begin(); it != set.end(); ++it) { CKey key = *it; if (!itfun(&key)) return false; } return true; } }; #endif /* _UC_HASH_SET_H_ */ ================================================ FILE: datastructures/sequential/LinkedListQueue.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2017, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _SEQUENTIAL_LINKED_LIST_QUEUE_H_ #define _SEQUENTIAL_LINKED_LIST_QUEUE_H_ /** *

A sequential implementation of a Linked List Queue

* * This is meant to be used by the Universal Constructs * * @author Pedro Ramalhete * @author Andreia Correia */ template class LinkedListQueue { private: struct Node { T* item; Node* next {nullptr}; Node(T* userItem) : item{userItem} { } }; Node* head {nullptr}; Node* tail {nullptr}; public: LinkedListQueue(unsigned int maxThreads=0) { Node* sentinelNode = new Node(nullptr); head = sentinelNode; tail = sentinelNode; } // Universal Constructs need a copy constructor on the underlying data structure LinkedListQueue(const LinkedListQueue& other) { head = new Node(nullptr); Node* node = head; Node* onode = other.head->next; while (onode != nullptr) { node->next = new Node(onode->item); node = node->next; onode = onode->next; } tail = node; } ~LinkedListQueue() { while (dequeue(0) != nullptr); // Drain the queue Node* lhead = head; delete lhead; } static std::string className() { return "LinkedListQueue"; } bool enqueue(T* item, const int tid=0) { if (item == nullptr) return false; Node* newNode = new Node(item); tail->next = newNode; tail = newNode; return true; } T* dequeue(const int tid=0) { Node* lhead = head; if (lhead == tail) return nullptr; head = lhead->next; delete lhead; return head->item; } }; #endif /* _SEQUENTIAL_LINKED_LIST_QUEUE_H_ */ ================================================ FILE: datastructures/sequential/LinkedListSet.hpp ================================================ #ifndef _SEQUENTIAL_LINKED_LIST_SET_H_ #define _SEQUENTIAL_LINKED_LIST_SET_H_ #include /** *

A sequential implementation of a Linked List Set

* * This is meant to be used by the Universal Constructs * */ template class LinkedListSet { private: struct Node { K key; Node* next{nullptr}; Node(const K& key) : key{key}, next{nullptr} { } Node(){ } }; Node* head {nullptr}; Node* tail {nullptr}; public: LinkedListSet() { Node* lhead = new Node(); Node* ltail = new Node(); head = lhead; head->next = ltail; tail = ltail; } // Universal Constructs need a copy constructor on the underlying data structure LinkedListSet(const LinkedListSet& other) { head = new Node(); Node* node = head; Node* onode = other.head->next; while (onode != other.tail) { node->next = new Node(onode->key); node = node->next; onode = onode->next; } tail = new Node(); node->next = tail; } ~LinkedListSet() { // Delete all the nodes in the list Node* prev = head; Node* node = prev->next; while (node != tail) { delete prev; prev = node; node = node->next; } delete prev; delete tail; } static std::string className() { return "LinkedListSet"; } /* * Adds a node with a key, returns false if the key is already in the set */ bool add(const K& key) { Node *prev, *node; find(key, prev, node); bool retval = !(node != tail && key == node->key); if (!retval) return retval; Node* newNode = new Node(key); prev->next = newNode; newNode->next = node; return retval; } /* * Removes a node with an key, returns false if the key is not in the set */ bool remove(const K& key) { Node *prev, *node; find(key, prev, node); bool retval = (node != tail && key == node->key); if (!retval) return retval; prev->next = node->next; delete node; return retval; } /* * Returns true if it finds a node with a matching key */ bool contains(const K& key) { Node *prev, *node; find(key, prev, node); return (node != tail && key == node->key); } void find(const K& key, Node*& prev, Node*& node) { for (prev = head; (node = prev->next) != tail; prev = node){ if ( !(node->key < key) ) break; } } // Used only for benchmarks bool addAll(K** keys, const int size) { bool retval = false; for (int i = 0; i < size; i++) { Node *prev, *node; find(*keys[i], prev, node); retval = !(node != tail && *keys[i] == node->key); if (retval) { Node* newNode = new Node(*keys[i]); prev->next = newNode; newNode->next = node; } } return true; } }; #endif /* _SEQUENTIAL_LINKED_LIST_SET_H_ */ ================================================ FILE: datastructures/sequential/RedBlackBST.hpp ================================================ #ifndef _RED_BLACK_BST_H_ #define _RED_BLACK_BST_H_ #include #include #include // Single-threaded implementation of a Red-Black Tree Map //http://algs4.cs.princeton.edu/code/edu/princeton/cs/algs4/RedBlackBST.java template class RedBlackBST { struct Node { K* key; V* val; Node* left {nullptr}; Node* right {nullptr}; bool color; // color of parent link int size; // subtree count Node(K* key, V* val, bool color, int size) : key{key}, val{val}, color{color}, size{size} {} }; Node *root {nullptr}; // root of the BST static const bool RED = true; static const bool BLACK = false; public: /** * Initializes an empty symbol table. */ RedBlackBST(unsigned int maxThreads=128) { } /*************************************************************************** * Node helper methods. ***************************************************************************/ // is node x red; false if x is null ? 
bool isRed(Node* x) { if (x == nullptr) return false; return x->color == RED; } // number of node in subtree rooted at x; 0 if x is null int size(Node* x) { if (x == nullptr) return 0; return x->size; } /** * Returns the number of key-value pairs in this symbol table. * @return the number of key-value pairs in this symbol table */ int size() { return size(root); } /** * Is this symbol table empty? * @return {@code true} if this symbol table is empty and {@code false} otherwise */ bool isEmpty() { return root == nullptr; } /*************************************************************************** * Standard BST search-> ***************************************************************************/ /** * Returns the value associated with the given key. * @param key the key * @return the value associated with the given key if the key is in the symbol table * and {@code null} if the key is not in the symbol table * @throws IllegalArgumentException if {@code key} is {@code null} */ V* get(K* key) { if (key == nullptr) throw std::invalid_argument("item can not be nullptr"); return get(root, key); } // value associated with the given key in subtree rooted at x; null if no such key V* get(Node* x, K* key) { while (x != nullptr) { if (*key < *x->key) x = x->left; else if (*x->key < *key) x = x->right; else return x->val; } return nullptr; } /** * Does this symbol table contain the given key? * @param key the key * @return {@code true} if this symbol table contains {@code key} and * {@code false} otherwise * @throws IllegalArgumentException if {@code key} is {@code null} */ bool contains(K* key) { return get(key) != nullptr; } /*************************************************************************** * Red-black tree insertion. ***************************************************************************/ /** * Inserts the specified key-value pair into the symbol table, overwriting the old * value with the new value if the symbol table already contains the specified key. * Deletes the specified key (and its associated value) from this symbol table * if the specified value is {@code null}. * * @param key the key * @param val the value * @throws IllegalArgumentException if {@code key} is {@code null} */ void put(K* key, V* val) { if (key == nullptr) throw std::invalid_argument("item can not be nullptr"); if (val == nullptr) { deleteKey(key); return; } root = put(root, key, val); root->color = BLACK; // assert check(); } // insert the key-value pair in the subtree rooted at h Node* put(Node* h, K* key, V* val) { if (h == nullptr) return new Node(key, val, RED, 1); if (*key < *h->key) h->left = put(h->left, key, val); else if (*h->key < *key) h->right = put(h->right, key, val); else h->val = val; // fix-up any right-leaning links if (isRed(h->right) && !isRed(h->left)) h = rotateLeft(h); if (isRed(h->left) && isRed(h->left->left)) h = rotateRight(h); if (isRed(h->left) && isRed(h->right)) flipColors(h); h->size = size(h->left) + size(h->right) + 1; return h; } /*************************************************************************** * Red-black tree deletion. ***************************************************************************/ /** * Removes the smallest key and associated value from the symbol table. 
* @throws NoSuchElementException if the symbol table is empty */ void deleteMin() { if (isEmpty()) throw std::invalid_argument("item can not be nullptr"); // if both children of root are black, set root to red if (!isRed(root->left) && !isRed(root->right)) root->color = RED; root = deleteMin(root); if (!isEmpty()) root->color = BLACK; // assert check(); } // delete the key-value pair with the minimum key rooted at h Node* deleteMin(Node* h) { if (h->left == nullptr) return nullptr; if (!isRed(h->left) && !isRed(h->left->left)) h = moveRedLeft(h); h->left = deleteMin(h->left); return balance(h); } /** * Removes the largest key and associated value from the symbol table. * @throws NoSuchElementException if the symbol table is empty */ void deleteMax() { if (isEmpty()) throw std::invalid_argument("item can not be nullptr"); // if both children of root are black, set root to red if (!isRed(root->left) && !isRed(root->right)) root->color = RED; root = deleteMax(root); if (!isEmpty()) root->color = BLACK; // assert check(); } // delete the key-value pair with the maximum key rooted at h Node* deleteMax(Node* h) { if (isRed(h->left)) h = rotateRight(h); if (h->right == nullptr) return nullptr; if (!isRed(h->right) && !isRed(h->right->left)) h = moveRedRight(h); h->right = deleteMax(h->right); return balance(h); } /** * Removes the specified key and its associated value from this symbol table * (if the key is in this symbol table). * * @param key the key * @throws IllegalArgumentException if {@code key} is {@code null} */ void deleteKey(K* key) { if (key == nullptr) throw std::invalid_argument("item can not be nullptr"); if (!contains(key)) return; // if both children of root are black, set root to red if (!isRed(root->left) && !isRed(root->right)) root->color = RED; root = deleteKey(root, key); if (!isEmpty()) root->color = BLACK; // assert check(); } // delete the key-value pair with the given key rooted at h Node* deleteKey(Node* h, K* key) { // assert get(h, key) != null; if (*key < *h->key) { if (!isRed(h->left) && !isRed(h->left->left)) h = moveRedLeft(h); h->left = deleteKey(h->left, key); } else { if (isRed(h->left)) h = rotateRight(h); if (*key == *h->key && (h->right == nullptr)) return nullptr; if (!isRed(h->right) && !isRed(h->right->left)) h = moveRedRight(h); if (*key == *h->key) { Node* x = min(h->right); h->key = x->key; h->val = x->val; // h->val = get(h->right, min(h->right).key); // h->key = min(h->right).key; h->right = deleteMin(h->right); } else h->right = deleteKey(h->right, key); } return balance(h); } /*************************************************************************** * Red-black tree helper functions. 
***************************************************************************/ // make a left-leaning link lean to the right Node* rotateRight(Node* h) { // assert (h != null) && isRed(h->left); Node* x = h->left; h->left = x->right; x->right = h; x->color = x->right->color; x->right->color = RED; x->size = h->size; h->size = size(h->left) + size(h->right) + 1; return x; } // make a right-leaning link lean to the left Node* rotateLeft(Node* h) { // assert (h != null) && isRed(h->right); Node* x = h->right; h->right = x->left; x->left = h; x->color = x->left->color; x->left->color = RED; x->size = h->size; h->size = size(h->left) + size(h->right) + 1; return x; } // flip the colors of a node and its two children void flipColors(Node* h) { // h must have opposite color of its two children // assert (h != null) && (h->left != null) && (h->right != null); // assert (!isRed(h) && isRed(h->left) && isRed(h->right)) // || (isRed(h) && !isRed(h->left) && !isRed(h->right)); h->color = !h->color; h->left->color = !h->left->color; h->right->color = !h->right->color; } // Assuming that h is red and both h->left and h->left.left // are black, make h->left or one of its children red. Node* moveRedLeft(Node* h) { // assert (h != null); // assert isRed(h) && !isRed(h->left) && !isRed(h->left.left); flipColors(h); if (isRed(h->right->left)) { h->right = rotateRight(h->right); h = rotateLeft(h); flipColors(h); } return h; } // Assuming that h is red and both h->right and h->right.left // are black, make h->right or one of its children red. Node* moveRedRight(Node* h) { // assert (h != null); // assert isRed(h) && !isRed(h->right) && !isRed(h->right.left); flipColors(h); if (isRed(h->left->left)) { h = rotateRight(h); flipColors(h); } return h; } // restore red-black tree invariant Node* balance(Node* h) { // assert (h != null); if (isRed(h->right)) h = rotateLeft(h); if (isRed(h->left) && isRed(h->left->left)) h = rotateRight(h); if (isRed(h->left) && isRed(h->right)) flipColors(h); h->size = size(h->left) + size(h->right) + 1; return h; } /*************************************************************************** * Utility functions. ***************************************************************************/ /** * Returns the height of the BST (for debugging). * @return the height of the BST (a 1-node tree has height 0) */ int height() { return height(root); } int height(Node* x) { if (x == nullptr) return -1; return 1 + std::max(height(x->left), height(x->right)); } /*************************************************************************** * Ordered symbol table methods. ***************************************************************************/ /** * Returns the smallest key in the symbol table. * @return the smallest key in the symbol table * @throws NoSuchElementException if the symbol table is empty */ K* min() { if (isEmpty()) throw std::invalid_argument("item can not be nullptr"); return min(root).key; } // the smallest key in subtree rooted at x; null if no such key Node* min(Node* x) { // assert x != null; if (x->left == nullptr) return x; else return min(x->left); } /** * Returns the largest key in the symbol table. 
* @return the largest key in the symbol table * @throws NoSuchElementException if the symbol table is empty */ K* max() { if (isEmpty()) throw std::invalid_argument("item can not be nullptr"); return max(root).key; } // the largest key in the subtree rooted at x; null if no such key Node* max(Node* x) { // assert x != null; if (x->right == nullptr) return x; else return max(x->right); } /** * Returns the largest key in the symbol table less than or equal to {@code key}. * @param key the key * @return the largest key in the symbol table less than or equal to {@code key} * @throws NoSuchElementException if there is no such key * @throws IllegalArgumentException if {@code key} is {@code null} */ K* floor(K* key) { if (key == nullptr) throw std::invalid_argument("item can not be nullptr"); if (isEmpty()) throw std::invalid_argument("item can not be nullptr"); Node* x = floor(root, key); if (x == nullptr) return nullptr; else return x->key; } // the largest key in the subtree rooted at x less than or equal to the given key Node* floor(Node* x, K* key) { if (x == nullptr) return nullptr; if (*key == *x->key) return x; if (*key < *x->key) return floor(x->left, key); Node* t = floor(x->right, key); if (t != nullptr) return t; else return x; } /** * Returns the smallest key in the symbol table greater than or equal to {@code key}. * @param key the key * @return the smallest key in the symbol table greater than or equal to {@code key} * @throws NoSuchElementException if there is no such key * @throws IllegalArgumentException if {@code key} is {@code null} */ K* ceiling(K* key) { if (key == nullptr) throw std::invalid_argument("item can not be nullptr"); if (isEmpty()) throw std::invalid_argument("item can not be nullptr"); Node* x = ceiling(root, key); if (x == nullptr) return nullptr; else return x->key; } // the smallest key in the subtree rooted at x greater than or equal to the given key Node* ceiling(Node* x, K* key) { if (x == nullptr) return nullptr; int cmp = key.compareTo(x->key); if (*key == *x->key) return x; if (*x->key < *key) return ceiling(x->right, key); Node* t = ceiling(x->left, key); if (t != nullptr) return t; else return x; } /** * Return the kth smallest key in the symbol table. * @param k the order statistic * @return the {@code k}th smallest key in the symbol table * @throws IllegalArgumentException unless {@code k} is between 0 and * n1 */ K* select(int k) { if (k < 0 || k >= size()) { throw std::invalid_argument("item can not be nullptr"); } Node x = select(root, k); return x->key; } // the key of rank k in the subtree rooted at x Node* select(Node* x, int k) { // assert x != null; // assert k >= 0 && k < size(x); int t = size(x->left); if (t > k) return select(x->left, k); else if (t < k) return select(x->right, k-t-1); else return x; } /** * Return the number of keys in the symbol table strictly less than {@code key}. 
* @param key the key * @return the number of keys in the symbol table strictly less than {@code key} * @throws IllegalArgumentException if {@code key} is {@code null} */ int rank(K* key) { if (key == nullptr) throw std::invalid_argument("item can not be nullptr"); return rank(key, root); } // number of keys less than key in the subtree rooted at x int rank(K* key, Node* x) { if (x == nullptr) return 0; if (*key < *x->key) return rank(key, x->left); else if (*x->key < *key) return 1 + size(x->left) + rank(key, x->right); else return size(x->left); } /*************************************************************************** * Range count and range search-> ***************************************************************************/ /** * Returns the number of keys in the symbol table in the given range. * * @param lo minimum endpoint * @param hi maximum endpoint * @return the number of keys in the sybol table between {@code lo} * (inclusive) and {@code hi} (inclusive) * @throws IllegalArgumentException if either {@code lo} or {@code hi} * is {@code null} */ int size(K* lo, K* hi) { if (lo == nullptr) throw std::invalid_argument("item can not be nullptr"); if (hi == nullptr) throw std::invalid_argument("item can not be nullptr"); if (*lo < *hi) return 0; if (contains(hi)) return rank(hi) - rank(lo) + 1; else return rank(hi) - rank(lo); } /*************************************************************************** * Check integrity of red-black tree data structure. ***************************************************************************/ bool check() { if (!isBST()) std::cout << "Not in symmetric order\n"; if (!isSizeConsistent()) std::cout << "Subtree counts not consistent\n"; //if (!isRankConsistent()) std::cout << "Ranks not consistent\n"; if (!is23()) std::cout << "Not a 2-3 tree\n"; if (!isBalanced()) std::cout << "Not balanced\n"; return isBST() && isSizeConsistent() && is23() && isBalanced(); } // does this binary tree satisfy symmetric order? // Note: this test also ensures that data structure is a binary tree since order is strict bool isBST() { return isBST(root, nullptr, nullptr); } // is the tree rooted at x a BST with all keys strictly between min and max // (if min or max is null, treat as empty constraint) // Credit: Bob Dondero's elegant solution bool isBST(Node* x, K* min, K* max) { if (x == nullptr) return true; if (min != nullptr && x->key.compareTo(min) <= 0) return false; if (max != nullptr && x->key.compareTo(max) >= 0) return false; return isBST(x->left, min, x->key) && isBST(x->right, x->key, max); } // are the size fields correct? bool isSizeConsistent() { return isSizeConsistent(root); } bool isSizeConsistent(Node* x) { if (x == nullptr) return true; if (x->size != size(x->left) + size(x->right) + 1) return false; return isSizeConsistent(x->left) && isSizeConsistent(x->right); } /* // check that ranks are consistent bool isRankConsistent() { for (int i = 0; i < size(); i++) if (i != rank(select(i))) return false; for (K* key : keys()) if (key.compareTo(select(rank(key))) != 0) return false; return true; } */ // Does the tree have no red right links, and at most one (left) // red links in a row on any path? bool is23() { return is23(root); } bool is23(Node* x) { if (x == nullptr) return true; if (isRed(x->right)) return false; if (x != root && isRed(x) && isRed(x->left)) return false; return is23(x->left) && is23(x->right); } // do all paths from root to leaf have same number of black edges? 
bool isBalanced() { int black = 0; // number of black links on path from root to min Node x = root; while (x != nullptr) { if (!isRed(x)) black++; x = x->left; } return isBalanced(root, black); } // does every path from the root to a leaf have the given number of black links? bool isBalanced(Node* x, int black) { if (x == nullptr) return black == 0; if (!isRed(x)) black--; return isBalanced(x->left, black) && isBalanced(x->right, black); } // Set methods bool add(K* key, const int tid) { if (contains(key)) return false; put(key,key); return true; } bool remove(K* key, const int tid) { if (!contains(key)) return false; deleteKey(key); return true; } inline bool contains(K* key, const int tid) { return contains(key); } std::string className() { return "RedBlackBST"; } }; #endif // _RED_BLACK_BST_H_ ================================================ FILE: datastructures/sequential/SortedArraySet.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2016, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _SORTEDARRAYSET_H_ #define _SORTEDARRAYSET_H_ #include // TODO: Test this for correctness /** * This is storing the pointers to the T instances, not the actual T instances. 
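* Note that lookup() compares through the pointers, e.g. *key < *(vec[pos]),
* so T must define operator< and operator== for these comparisons to work.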
*/ template class SortedArraySet { private: unsigned max_size = 32; T** vec; // TODO: change this to T if we change the API from T* to T& unsigned size = 0; static const int NOT_FOUND = 0; //std::atomic flag {false}; // For de debugging int lookup(T* key) { // Cover the special case of an empty array if (size== 0) return NOT_FOUND; int minPos = 0; int maxPos = size-1; //std::cout << "vec[0] = " << vec[0] << "\n"; // Special cases for first and last items if (*key < *(vec[0])) return NOT_FOUND; if (*key == *(vec[0])) return 0; if (*key == *(vec[maxPos])) return maxPos; if (*(vec[maxPos]) < *key) return maxPos+1; while (true) { int pos = (maxPos-minPos)/2 + minPos; if (*key < *(vec[pos])) { maxPos = pos; } else if (*key == *(vec[pos])) { return pos; } else { minPos = pos; } if (maxPos-minPos <= 1) { return maxPos; } } } public: SortedArraySet() { vec = new T*[max_size]; } ~SortedArraySet() { delete[] vec; } // We need a copy constructor to be able to use it in CXMutation SortedArraySet(const SortedArraySet& fromssv) { vec = new T*[fromssv.max_size]; max_size = fromssv.max_size; size = fromssv.size; for(unsigned i=0;iseq<<" "<tid<<" vex "<seq<<" "<tid<<" vex "<seq<< " " << vec[index]->tid<<"\n"; //assert(false); //flag.store(false); return false; } if(size+1==max_size){ T** newvec = new T*[2*max_size]; for(unsigned i=0;i=index+1;i--){ vec[i]=vec[i-1]; } } vec[index] = key; } size++; //flag.store(false); return true; } bool contains(T* key) { //if (flag.load()) std::cout << "contains() ERRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRROOOOOOOOOOOOOOR\n"; unsigned index = lookup(key); if (index == size) { return false; } return *key == *(vec[index]); } bool print() { // For debug purposes for(unsigned i=0;i BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _SORTED_VECTOR_SET_H_ #define _SORTED_VECTOR_SET_H_ #include #include // TODO: Test this for correctness /** * This is storing the pointers to the T instances, not the actual T instances. 
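* As in SortedArraySet, lookup() dereferences the stored pointers for its
* binary search, so T must define operator< and operator==.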
*/ template class SortedVectorSet { private: std::vector vec; // TODO: change this to T if we change the API from T* to T& static const int NOT_FOUND = 0; //std::atomic flag {false}; // For de debugging int lookup(T* key) { // Cover the special case of an empty array if (vec.size()== 0) return NOT_FOUND; int minPos = 0; int maxPos = vec.size()-1; // Special cases for first and last items if (*key < *(vec[0])) return NOT_FOUND; if (*key == *(vec[0])) return 0; if (*key == *(vec[maxPos])) return maxPos; if (*(vec[maxPos]) < *key) return maxPos+1; while (true) { int pos = (maxPos-minPos)/2 + minPos; if (*key < *(vec[pos])) { maxPos = pos; } else if (*key == *(vec[pos])) { return pos; } else { minPos = pos; } if (maxPos-minPos <= 1) { return maxPos; } } } public: SortedVectorSet() { } // We need a copy constructor to be able to use it in CXMutation SortedVectorSet(const SortedVectorSet& from) { vec = from.vec; // Do a copy of the vector } static std::string className() { return "SortedVectorSet"; } /** * When the curr.key is seen to be null it means we reached the tail node */ bool remove(T* key) { //if (flag.load()) std::cout << "remove() ERRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRROOOOOOOOOOOOOOR\n"; //flag.store(true); unsigned index = lookup(key); if (index == vec.size()) { //std::cout<<"remove key "<seq<<" "<tid<<" vex "<seq<<" "<tid<<" vex "<seq<< " " << vec[index]->tid<<"\n"; //assert(false); //flag.store(false); return false; } vec.insert(vec.begin()+index, key); //flag.store(false); return true; } bool contains(T* key) { //if (flag.load()) std::cout << "contains() ERRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRRROOOOOOOOOOOOOOR\n"; unsigned index = lookup(key); if (index == vec.size()) { return false; } return *key == *(vec[index]); } bool print() { // For debug purposes for (T* p : vec) std:: cout << p << ","; std::cout << "\n"; return true; } }; #endif /* _SORTED_VECTOR_SET_H_ */ ================================================ FILE: datastructures/sequential/TreeSet.hpp ================================================ /****************************************************************************** * Copyright (c) 2014-2016, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _CX_TREE_SET_H_ #define _CX_TREE_SET_H_ #include #include #include //#include "../datastructures/sequential/RedBlackBST.hpp" // TODO: change CKey* to CKey& // This is a wrapper to std::set, which should be a Red-Black tree template class TreeSet { private: std::set set; // Use this instead if we want to have control over the Red-Black tree //RedBlackBST set; public: static std::string className() { return "TreeSet"; } bool add(CKey key) { if (set.find(key) == set.end()) { set.insert(key); // TODO: can we improve this so we don't have to lookup twice? return true; } return false; } bool remove(CKey key) { auto iter = set.find(key); if (iter == set.end()) return false; set.erase(iter); return true; } bool contains(CKey key) { if (set.find(key) == set.end()) return false; return true; // TODO: optimize this } bool iterateAll(std::function itfunc) { for (auto it = set.begin(); it != set.end(); ++it) { CKey key = *it; if (!itfunc(&key)) return false; } return true; } /* bool add(CKey* key) { return set.add(key, 0); } bool remove(CKey* key) { return set.remove(key, 0); } bool contains(CKey* key) { return set.contains(key, 0); } */ }; #endif /* _TREE_SET_H_ */ ================================================ FILE: datastructures/treemaps/ESTMRedBlackTree.hpp ================================================ #ifndef _ESTM_RED_BLACK_BST_H_ #define _ESTM_RED_BLACK_BST_H_ #include #include #include #include "stms/ESTM.hpp" // Adapted from Java to C++ from the original at http://algs4.cs.princeton.edu/code/edu/princeton/cs/algs4/RedBlackBST.java template class ESTMRedBlackTree { const int64_t COLOR_RED = 0; const int64_t COLOR_BLACK = 1; struct Node { estm::tmtype key; estm::tmtype val; estm::tmtype left {nullptr}; estm::tmtype right {nullptr}; estm::tmtype color; // color of parent link estm::tmtype size; // subtree count Node(const K& key, const V& val, int64_t color, int64_t size) : key{key}, val{val}, color{color}, size{size} {} }; estm::tmtype root {nullptr}; // root of the BST inline void assignAndFreeIfNull(estm::tmtype& z, Node* w) { Node* tofree = z; z = w; if (w == nullptr) estm::tmDelete(tofree); } public: /** * Initializes an empty symbol table. */ ESTMRedBlackTree(int maxThreads=0){ } ~ESTMRedBlackTree() { // The transaction log is not enough to delete everything if there are too many, so we delete 1000 per transaction for (int i = 0; i < 1000; i++) { estm::updateTx([&] () { if (root == nullptr) return true; deleteMin(); return true; }); } } /*************************************************************************** * Node helper methods. ***************************************************************************/ // is node x red; false if x is null ? 
bool isRed(Node* x) { if (x == nullptr) return false; return x->color == COLOR_RED; } // number of node in subtree rooted at x; 0 if x is null int size(Node* x) { if (x == nullptr) return 0; return x->size; } /** * Returns the number of key-value pairs in this symbol table. * @return the number of key-value pairs in this symbol table */ int size() { return size(root); } /** * Is this symbol table empty? * @return {@code true} if this symbol table is empty and {@code false} otherwise */ bool isEmpty() { return root == nullptr; } /*************************************************************************** * Standard BST search-> ***************************************************************************/ /** * Returns the value associated with the given key. * @param key the key * @return the value associated with the given key if the key is in the symbol table * and {@code null} if the key is not in the symbol table * @throws IllegalArgumentException if {@code key} is {@code null} */ bool innerGet(K& key, V& oldValue, const bool saveOldValue) { bool found = get(root, key); if (!found) return false; //if (saveOldValue) oldValue = *val; // Copy of V return true; } // value associated with the given key in subtree rooted at x; null if no such key bool get(Node* x, K& key) { while (x != nullptr) { if (key < x->key) x = x->left; else if (x->key < key) x = x->right; else return true; } return false; } /** * Does this symbol table contain the given key? * @param key the key * @return {@code true} if this symbol table contains {@code key} and * {@code false} otherwise * @throws IllegalArgumentException if {@code key} is {@code null} */ bool containsKey(const K& key) { return get(key) != nullptr; } /*************************************************************************** * Red-black tree insertion. ***************************************************************************/ /** * Inserts the specified key-value pair into the symbol table, overwriting the old * value with the new value if the symbol table already contains the specified key. * Deletes the specified key (and its associated value) from this symbol table * if the specified value is {@code null}. * * @param key the key * @param val the value * @throws IllegalArgumentException if {@code key} is {@code null} */ bool innerPut(const K& key, const V& value) { bool ret = false; root = put(root, key, value, ret); root->color = COLOR_BLACK; return ret; } // insert the key-value pair in the subtree rooted at h Node* put(Node* h, const K& key, const V& val, bool& ret) { if (h == nullptr) { ret = true; return estm::tmNew(key, val, COLOR_RED, 1); } if (key < h->key) h->left = put(h->left, key, val, ret); else if (h->key < key) h->right = put(h->right, key, val, ret); else h->val = val; // fix-up any right-leaning links if (isRed(h->right) && !isRed(h->left)) h = rotateLeft(h); if (isRed(h->left) && isRed(h->left->left)) h = rotateRight(h); if (isRed(h->left) && isRed(h->right)) flipColors(h); h->size = size(h->left) + size(h->right) + 1; return h; } /*************************************************************************** * Red-black tree deletion. ***************************************************************************/ /** * Removes the smallest key and associated value from the symbol table. 
* @throws NoSuchElementException if the symbol table is empty */ void deleteMin() { if (isEmpty()) return; // if both children of root are black, set root to red if (!isRed(root->left) && !isRed(root->right)) root->color = COLOR_RED; assignAndFreeIfNull(root, deleteMin(root)); if (!isEmpty()) root->color = COLOR_BLACK; // assert check(); } // delete the key-value pair with the minimum key rooted at h Node* deleteMin(Node* h) { if (h->left == nullptr) return nullptr; if (!isRed(h->left) && !isRed(h->left->left)) h = moveRedLeft(h); assignAndFreeIfNull(h->left, deleteMin(h->left)); return balance(h); } /** * Removes the largest key and associated value from the symbol table. * @throws NoSuchElementException if the symbol table is empty */ void deleteMax() { if (isEmpty()) return; // if both children of root are black, set root to red if (!isRed(root->left) && !isRed(root->right)) root->color = COLOR_RED; root = deleteMax(root); if (!isEmpty()) root->color = COLOR_BLACK; // assert check(); } // delete the key-value pair with the maximum key rooted at h Node* deleteMax(Node* h) { if (isRed(h->left)) h = rotateRight(h); if (h->right == nullptr) return nullptr; if (!isRed(h->right) && !isRed(h->right->left)) h = moveRedRight(h); h->right = deleteMax(h->right); return balance(h); } /** * Removes the specified key and its associated value from this symbol table * (if the key is in this symbol table). * * @param key the key */ void innerRemove(const K& key) { // if both children of root are black, set root to red if (!isRed(root->left) && !isRed(root->right)) root->color = COLOR_RED; assignAndFreeIfNull(root, deleteKey(root, key)); if (!isEmpty()) root->color = COLOR_BLACK; // assert check(); } // delete the key-value pair with the given key rooted at h Node* deleteKey(Node* h, const K& key) { // assert get(h, key) != null; if (key < h->key) { if (!isRed(h->left) && !isRed(h->left->left)) { h = moveRedLeft(h); } assignAndFreeIfNull(h->left, deleteKey(h->left, key)); } else { if (isRed(h->left)) { h = rotateRight(h); } if (key == h->key && (h->right == nullptr)) { return nullptr; } if (!isRed(h->right) && !isRed(h->right->left)) { h = moveRedRight(h); } if (key == h->key) { Node* x = min(h->right); h->key = x->key; h->val = x->val; // h->val = get(h->right, min(h->right).key); // h->key = min(h->right).key; assignAndFreeIfNull(h->right, deleteMin(h->right)); } else { assignAndFreeIfNull(h->right, deleteKey(h->right, key)); } } return balance(h); } /*************************************************************************** * Red-black tree helper functions. 
***************************************************************************/ // make a left-leaning link lean to the right Node* rotateRight(Node* h) { // assert (h != null) && isRed(h->left); Node* x = h->left; h->left = x->right; x->right = h; x->color = x->right->color; x->right->color = COLOR_RED; x->size = h->size; h->size = size(h->left) + size(h->right) + 1; return x; } // make a right-leaning link lean to the left Node* rotateLeft(Node* h) { // assert (h != null) && isRed(h->right); Node* x = h->right; h->right = x->left; x->left = h; x->color = x->left->color; x->left->color = COLOR_RED; x->size = h->size; h->size = size(h->left) + size(h->right) + 1; return x; } // flip the colors of a node and its two children void flipColors(Node* h) { // h must have opposite color of its two children // assert (h != null) && (h->left != null) && (h->right != null); // assert (!isRed(h) && isRed(h->left) && isRed(h->right)) // || (isRed(h) && !isRed(h->left) && !isRed(h->right)); h->color = !h->color; h->left->color = !h->left->color; h->right->color = !h->right->color; } // Assuming that h is red and both h->left and h->left.left // are black, make h->left or one of its children red. Node* moveRedLeft(Node* h) { // assert (h != null); // assert isRed(h) && !isRed(h->left) && !isRed(h->left.left); flipColors(h); if (isRed(h->right->left)) { h->right = rotateRight(h->right); h = rotateLeft(h); flipColors(h); } return h; } // Assuming that h is red and both h->right and h->right.left // are black, make h->right or one of its children red. Node* moveRedRight(Node* h) { // assert (h != null); // assert isRed(h) && !isRed(h->right) && !isRed(h->right.left); flipColors(h); if (isRed(h->left->left)) { h = rotateRight(h); flipColors(h); } return h; } // restore red-black tree invariant Node* balance(Node* h) { // assert (h != null); if (isRed(h->right)) h = rotateLeft(h); if (isRed(h->left) && isRed(h->left->left)) h = rotateRight(h); if (isRed(h->left) && isRed(h->right)) flipColors(h); h->size = size(h->left) + size(h->right) + 1; return h; } /*************************************************************************** * Utility functions. ***************************************************************************/ /** * Returns the height of the BST (for debugging). * @return the height of the BST (a 1-node tree has height 0) */ int height() { return height(root); } int height(Node* x) { if (x == nullptr) return -1; return 1 + std::max(height(x->left), height(x->right)); } /*************************************************************************** * Ordered symbol table methods. ***************************************************************************/ /** * Returns the smallest key in the symbol table. * @return the smallest key in the symbol table * @throws NoSuchElementException if the symbol table is empty */ K* min() { if (isEmpty()) return nullptr; return min(root).key; } // the smallest key in subtree rooted at x; null if no such key Node* min(Node* x) { // assert x != null; if (x->left == nullptr) return x; else return min(x->left); } /** * Returns the largest key in the symbol table. 
* @return the largest key in the symbol table * @throws NoSuchElementException if the symbol table is empty */ K* max() { if (isEmpty()) return nullptr; return max(root).key; } // the largest key in the subtree rooted at x; null if no such key Node* max(Node* x) { // assert x != null; if (x->right == nullptr) return x; else return max(x->right); } /** * Returns the largest key in the symbol table less than or equal to {@code key}. * @param key the key * @return the largest key in the symbol table less than or equal to {@code key} * @throws NoSuchElementException if there is no such key * @throws IllegalArgumentException if {@code key} is {@code null} */ K* floor(const K& key) { if (key == nullptr) return nullptr; if (isEmpty()) return nullptr; Node* x = floor(root, key); if (x == nullptr) return nullptr; else return x->key; } // the largest key in the subtree rooted at x less than or equal to the given key Node* floor(Node* x, const K& key) { if (x == nullptr) return nullptr; if (key == x->key) return x; if (key < x->key) return floor(x->left, key); Node* t = floor(x->right, key); if (t != nullptr) return t; else return x; } /** * Returns the smallest key in the symbol table greater than or equal to {@code key}. * @param key the key * @return the smallest key in the symbol table greater than or equal to {@code key} * @throws NoSuchElementException if there is no such key * @throws IllegalArgumentException if {@code key} is {@code null} */ K* ceiling(const K& key) { if (key == nullptr) return nullptr; if (isEmpty()) return nullptr; Node* x = ceiling(root, key); if (x == nullptr) return nullptr; else return x->key; } // the smallest key in the subtree rooted at x greater than or equal to the given key Node* ceiling(Node* x, const K& key) { if (x == nullptr) return nullptr; if (key == x->key) return x; if (x->key < key) return ceiling(x->right, key); Node* t = ceiling(x->left, key); if (t != nullptr) return t; else return x; } /** * Return the kth smallest key in the symbol table. * @param k the order statistic * @return the {@code k}th smallest key in the symbol table * @throws IllegalArgumentException unless {@code k} is between 0 and * n1 */ K* select(int k) { if (k < 0 || k >= size()) { return nullptr; } Node x = select(root, k); return x->key; } // the key of rank k in the subtree rooted at x Node* select(Node* x, int k) { // assert x != null; // assert k >= 0 && k < size(x); int t = size(x->left); if (t > k) return select(x->left, k); else if (t < k) return select(x->right, k-t-1); else return x; } /** * Return the number of keys in the symbol table strictly less than {@code key}. * @param key the key * @return the number of keys in the symbol table strictly less than {@code key} * @throws IllegalArgumentException if {@code key} is {@code null} */ int rank(const K& key) { if (key == nullptr) return -1; return rank(key, root); } // number of keys less than key in the subtree rooted at x int rank(const K& key, Node* x) { if (x == nullptr) return 0; if (key < x->key) return rank(key, x->left); else if (x->key < key) return 1 + size(x->left) + rank(key, x->right); else return size(x->left); } /*************************************************************************** * Range count and range search-> ***************************************************************************/ /** * Returns the number of keys in the symbol table in the given range. 
* * @param lo minimum endpoint * @param hi maximum endpoint * @return the number of keys in the sybol table between {@code lo} * (inclusive) and {@code hi} (inclusive) * @throws IllegalArgumentException if either {@code lo} or {@code hi} * is {@code null} */ int size(const K& lo, const K& hi) { if (lo == nullptr) return 0; if (hi == nullptr) return 0; if (hi < lo) return 0; if (containsKey(hi)) return rank(hi) - rank(lo) + 1; else return rank(hi) - rank(lo); } /*************************************************************************** * Check integrity of red-black tree data structure. ***************************************************************************/ bool check() { if (!isBST()) std::cout << "Not in symmetric order\n"; if (!isSizeConsistent()) std::cout << "Subtree counts not consistent\n"; //if (!isRankConsistent()) std::cout << "Ranks not consistent\n"; if (!is23()) std::cout << "Not a 2-3 tree\n"; if (!isBalanced()) std::cout << "Not balanced\n"; return isBST() && isSizeConsistent() && is23() && isBalanced(); } // does this binary tree satisfy symmetric order? // Note: this test also ensures that data structure is a binary tree since order is strict bool isBST() { return isBST(root, nullptr, nullptr); } // is the tree rooted at x a BST with all keys strictly between min and max // (if min or max is null, treat as empty constraint) // Credit: Bob Dondero's elegant solution bool isBST(Node* x, K* min, K* max) { if (x == nullptr) return true; // TODO: port these two lines //if (min != nullptr && x->key.compareTo(min) <= 0) return false; //if (max != nullptr && x->key.compareTo(max) >= 0) return false; return isBST(x->left, min, x->key) && isBST(x->right, x->key, max); } // are the size fields correct? bool isSizeConsistent() { return isSizeConsistent(root); } bool isSizeConsistent(Node* x) { if (x == nullptr) return true; if (x->size != size(x->left) + size(x->right) + 1) return false; return isSizeConsistent(x->left) && isSizeConsistent(x->right); } /* // check that ranks are consistent bool isRankConsistent() { for (int i = 0; i < size(); i++) if (i != rank(select(i))) return false; for (K* key : keys()) if (key.compareTo(select(rank(key))) != 0) return false; return true; } */ // Does the tree have no red right links, and at most one (left) // red links in a row on any path? bool is23() { return is23(root); } bool is23(Node* x) { if (x == nullptr) return true; if (isRed(x->right)) return false; if (x != root && isRed(x) && isRed(x->left)) return false; return is23(x->left) && is23(x->right); } // do all paths from root to leaf have same number of black edges? bool isBalanced() { int black = 0; // number of black links on path from root to min Node x = root; while (x != nullptr) { if (!isRed(x)) black++; x = x->left; } return isBalanced(root, black); } // does every path from the root to a leaf have the given number of black links? 
bool isBalanced(Node* x, int black) { if (x == nullptr) return black == 0; if (!isRed(x)) black--; return isBalanced(x->left, black) && isBalanced(x->right, black); } // Inserts a key only if it's not already present bool add(const K& key, const int tid=0) { bool retval = false; estm::updateTx([&] () { retval = innerPut(key,key); }); return retval; } // Returns true only if the key was present bool remove(K& key, const int tid=0) { bool retval = false; estm::updateTx([&] () { V notused; retval = innerGet(key,notused,false); if (retval) innerRemove(key); }); return retval; } bool contains(K& key, const int tid=0) { bool retval = false; estm::readTx([&] () { V notused; retval = innerGet(key,notused,false); }); return retval; } // This is not fully transactionally but it's ok because we use it only on initialization. // We could make it fully transactionally, but we would have to increase the size of allocation/store logs. void addAll(K** keys, int size, const int tid=0) { for (int i = 0; i < size; i++) add(*keys[i], tid); } static std::string className() { return estm::ESTM::className() + "-RedBlackTree"; } }; #endif // _ESTM_RED_BLACK_BST_H_ ================================================ FILE: datastructures/treemaps/HazardEras.hpp ================================================ /****************************************************************************** * Copyright (c) 2016-2017, Pedro Ramalhete, Andreia Correia * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of Concurrency Freaks nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ****************************************************************************** */ #ifndef _HAZARD_ERAS_H_ #define _HAZARD_ERAS_H_ #include #include #include #include /* *

Hazard Eras

* This is a light-weight implementation of hazard eras, where each thread has a
* thread-local list of retired objects.
*
* This is based on the paper "Hazard Eras - Non-Blocking Memory Reclamation"
* by Pedro Ramalhete and Andreia Correia:
* https://github.com/pramalhe/ConcurrencyFreaks/blob/master/papers/hazarderas-2017.pdf
*
* The type T is used for the objects/nodes and it must have the uint64_t
* members newEra and delEra.
*
* The threshold R (named 'R' in the paper) is zero.
*
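* A minimal usage sketch (hypothetical caller code; Node must expose the
* uint64_t fields newEra and delEra):
*   HazardEras<Node> he {1, maxThreads};
*   Node* node = he.get_protected(0, atomicNodePtr, tid); // pin before reading
*   he.clear(tid);                                        // release the eras
*   he.retire(unlinkedNode, tid);                         // defer the delete
*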

* @author Pedro Ramalhete * @author Andreia Correia */ template class HazardEras { private: static const uint64_t NONE = 0; static const int HE_MAX_THREADS = 128; static const int MAX_HES = 5; // This is named 'K' in the HP paper static const int CLPAD = 128/sizeof(std::atomic); static const int HE_THRESHOLD_R = 0; // This is named 'R' in the HP paper const int maxHEs; const int maxThreads; alignas(128) std::atomic eraClock {1}; alignas(128) std::atomic* he[HE_MAX_THREADS]; // It's not nice that we have a lot of empty vectors, but we need padding to avoid false sharing alignas(128) std::vector retiredList[HE_MAX_THREADS*CLPAD]; public: HazardEras(int maxHEs=MAX_HES, int maxThreads=HE_MAX_THREADS) : maxHEs{maxHEs}, maxThreads{maxThreads} { for (int it = 0; it < HE_MAX_THREADS; it++) { he[it] = new std::atomic[CLPAD*2]; // We allocate four cache lines to allow for many hps and without false sharing retiredList[it*CLPAD].reserve(maxThreads*maxHEs); for (int ihe = 0; ihe < MAX_HES; ihe++) { he[it][ihe].store(NONE, std::memory_order_relaxed); } } static_assert(std::is_same::value, "T::newEra must be uint64_t"); static_assert(std::is_same::value, "T::delEra must be uint64_t"); } ~HazardEras() { for (int it = 0; it < HE_MAX_THREADS; it++) { delete[] he[it]; // Clear the current retired nodes for (unsigned iret = 0; iret < retiredList[it*CLPAD].size(); iret++) { delete retiredList[it*CLPAD][iret]; } } } inline uint64_t getEra() { return eraClock.load(); } /** * Progress Condition: wait-free bounded (by maxHEs) */ inline void clear(const int tid) { for (int ihe = 0; ihe < maxHEs; ihe++) { he[tid][ihe].store(NONE, std::memory_order_release); } } /** * Progress Condition: lock-free */ inline T* get_protected(int index, const std::atomic& atom, const int tid) { auto prevEra = he[tid][index].load(std::memory_order_relaxed); while (true) { T* ptr = atom.load(); auto era = eraClock.load(std::memory_order_acquire); if (era == prevEra) return ptr; he[tid][index].store(era); prevEra = era; } } inline void protectEraRelease(int index, int other, const int tid) { auto era = he[tid][other].load(std::memory_order_relaxed); if (he[tid][index].load(std::memory_order_relaxed) == era) return; he[tid][index].store(era, std::memory_order_release); } /* * Does a single iteration. Must be integrated into the algorithm that's using HE. * In other words, we must re-check if era has changed * * Progress Condition: wait-free population oblivious */ inline T* protectPtr(int index, const std::atomic& atom, uint64_t& prevEra, const int tid) { T* ptr = atom.load(std::memory_order_acquire); auto era = eraClock.load(); if (prevEra != era) { prevEra = era; he[tid][index].store(era, std::memory_order_relaxed); std::atomic_thread_fence(std::memory_order_seq_cst); } return ptr; } /** * Retire an object (node) * Progress Condition: wait-free bounded * * Doing rlist.erase() is not the most efficient way to remove entries from a std::vector, but ok... 
     */
    void retire(T* ptr, const int mytid) {
        auto currEra = eraClock.load();
        ptr->delEra = currEra;
        auto& rlist = retiredList[mytid*CLPAD];
        rlist.push_back(ptr);
        if (eraClock == currEra) eraClock.fetch_add(1);
        for (unsigned iret = 0; iret < rlist.size();) {
            auto obj = rlist[iret];
            if (canDelete(obj, mytid)) {
                rlist.erase(rlist.begin() + iret);
                delete obj;
                continue;
            }
            iret++;
        }
    }

private:
    bool canDelete(T* obj, const int mytid) {
        for (int tid = 0; tid < maxThreads; tid++) {
            for (int ihe = 0; ihe < maxHEs; ihe++) {
                const auto era = he[tid][ihe].load(std::memory_order_acquire);
                if (era == NONE || era < obj->newEra || era > obj->delEra) continue;
                return false;
            }
        }
        return true;
    }
};

#endif /* _HAZARD_ERAS_H_ */


================================================
FILE: datastructures/treemaps/NatarajanTreeHE.hpp
================================================
/*

Copyright 2017 University of Rochester

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Adapted from https://github.com/roghnin/Interval-Based-Reclamation/blob/master/src/rideables/NatarajanTree.hpp

Due to the usage of <optional>, this needs C++17 to compile

Pedro: I've adapted this for our benchmarks but the adaptation may contain errors,
please do not use this code in production!

*/

#ifndef _NATARAJAN_TREE_HAZARD_ERAS_H_
#define _NATARAJAN_TREE_HAZARD_ERAS_H_

#include <iostream>
#include <atomic>
#include <algorithm>
#include <map>
#include <optional>
#include "common/HazardEras.hpp"

template <class K, class V>
class NatarajanTreeHE {
private:
    const int MAX_THREADS = 128;

    /* structs*/
    struct Node {
        int level;
        K key;
        V val;
        std::atomic<Node*> left;
        std::atomic<Node*> right;
        uint64_t newEra {0};  // TODO: put he.getEra() here
        uint64_t delEra;

        Node(uint64_t newEra) : newEra{newEra} {};
        Node(uint64_t newEra, K k, V v, Node* l, Node* r,int lev):level(lev),key(k),val(v),left(l),right(r),newEra{newEra} {};
        Node(uint64_t newEra, K k, V v, Node* l, Node* r):level(-1),key(k),val(v),left(l),right(r),newEra{newEra} {};
    };

    struct SeekRecord{
        Node* ancestor;
        Node* successor;
        Node* parent;
        Node* leaf;
    };

    /* variables */
    HazardEras<Node> he {5, MAX_THREADS};
    K infK{};
    V defltV{};
    Node* r;
    Node* s;
    SeekRecord* records;
    const size_t GET_POINTER_BITS = 0xfffffffffffffffc;//for machine 64-bit or less.
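    /*
     * Note on GET_POINTER_BITS (illustrative, values hypothetical): the two
     * lowest bits of a child pointer carry the Natarajan-Mittal edge marks,
     * which is safe because Node addresses are at least 4-byte aligned, so
     * those bits are zero in any real address. Bit 0 is the "flag" (the leaf
     * below this edge is logically deleted) and bit 1 is the "tag" (the edge
     * must not be modified). Round-trip through the helpers defined below:
     *
     *     Node* p = ...;                          // aligned address, low bits 00
     *     Node* m = mixPtrFlgTg(p, true, false);  // (size_t)p | 0x1
     *     getPtr(m) == p; getFlg(m) == true; getTg(m) == false;
     */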
    /* helper functions */
    //flag and tags helpers
    inline Node* getPtr(Node* mptr){
        return (Node*) ((size_t)mptr & GET_POINTER_BITS);
    }
    inline bool getFlg(Node* mptr){
        return (bool)((size_t)mptr & 1);
    }
    inline bool getTg(Node* mptr){
        return (bool)((size_t)mptr & 2);
    }
    inline Node* mixPtrFlgTg(Node* ptr, bool flg, bool tg){
        return (Node*) ((size_t)ptr | flg | ((size_t)tg<<1));
    }
    //node comparison
    inline bool isInf(Node* n){
        return getInfLevel(n)!=-1;
    }
    inline int getInfLevel(Node* n){
        //0 for inf0, 1 for inf1, 2 for inf2, -1 for general val
        n=getPtr(n);
        return n->level;
    }
    inline bool nodeLess(Node* n1, Node* n2){
        n1=getPtr(n1);
        n2=getPtr(n2);
        int i1=getInfLevel(n1);
        int i2=getInfLevel(n2);
        return i1<i2 || (i1==-1 && i2==-1 && n1->key<n2->key);
    }
    inline bool nodeEqual(Node* n1, Node* n2){
        n1=getPtr(n1);
        n2=getPtr(n2);
        int i1=getInfLevel(n1);
        int i2=getInfLevel(n2);
        if(i1==-1&&i2==-1) return n1->key==n2->key;
        else return i1==i2;
    }
    inline bool nodeLessEqual(Node* n1, Node* n2){
        return !nodeLess(n2,n1);
    }

    /* private interfaces */
    void seek(K key, int tid);
    bool cleanup(K key, int tid);
    void doRangeQuery(Node& k1, Node& k2, int tid, Node* root, std::map<K,V>& res);

public:
    NatarajanTreeHE(const int maxThreads=0) {
        r = new Node(he.getEra(), infK,defltV,nullptr,nullptr,2);
        s = new Node(he.getEra(), infK,defltV,nullptr,nullptr,1);
        r->right = new Node(he.getEra(), infK,defltV,nullptr,nullptr,2);
        r->left = s;
        s->right = new Node(he.getEra(), infK,defltV,nullptr,nullptr,1);
        s->left = new Node(he.getEra(), infK,defltV,nullptr,nullptr,0);
        records = new SeekRecord[MAX_THREADS]{};
    };
    ~NatarajanTreeHE(){};

    static std::string className() { return "NatarajanTreeHE"; }

    std::optional<V> get(K key, int tid);
    std::optional<V> put(K key, V val, int tid);
    bool insert(K key, V val, int tid);
    std::optional<V> innerRemove(K key, int tid);
    std::optional<V> replace(K key, V val, int tid);
    std::map<K,V> rangeQuery(K key1, K key2, int& len, int tid);
    // Used only by our tree benchmarks
    bool add(K key, int tid);
    bool remove(K key, int tid);
    bool contains(K key, int tid);
    void addAll(K** keys, const int size, const int tid);
};

//-------Definition----------
template <class K, class V>
void NatarajanTreeHE<K,V>::seek(K key, int tid){
    /* initialize the seek record using sentinel nodes */
    Node keyNode{he.getEra(),key,defltV,nullptr,nullptr};//node to be compared
    SeekRecord* seekRecord = &(records[tid]);
    seekRecord->ancestor = r;
    seekRecord->successor = he.get_protected(1, r->left, tid);
    seekRecord->parent = he.get_protected(2, r->left, tid);
    seekRecord->leaf = getPtr(he.get_protected(3, s->left, tid));

    /* initialize other variables used in the traversal */
    Node* parentField = he.get_protected(3, seekRecord->parent->left, tid);
    Node* currentField = he.get_protected(4, seekRecord->leaf->left,tid);
    Node* current = getPtr(currentField);

    /* traverse the tree */
    while(current!=nullptr){
        /* check if the edge from the current parent node is tagged */
        if(!getTg(parentField)){
            /*
             * found an untagged edge in the access path;
             * advance ancestor and successor pointers.
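             * (protectEraRelease(dst, src, tid) simply copies the era already
             * published in slot 'src' into slot 'dst', so each node keeps an
             * active reservation while it changes role during this hand-off)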
             */
            seekRecord->ancestor=seekRecord->parent;
            he.protectEraRelease(0, 1, tid);
            seekRecord->successor=seekRecord->leaf;
            he.protectEraRelease(1, 3, tid);
        }

        /* advance parent and leaf pointers */
        seekRecord->parent = seekRecord->leaf;
        he.protectEraRelease(2, 3, tid);
        seekRecord->leaf = current;
        he.protectEraRelease(3, 4, tid);

        /* update other variables used in traversal */
        parentField=currentField;
        if(nodeLess(&keyNode,current)){
            currentField = he.get_protected(4, current->left, tid);
        }
        else{
            currentField = he.get_protected(4, current->right, tid);
        }
        current=getPtr(currentField);
    }
    /* traversal complete */
    return;
}

template <class K, class V>
bool NatarajanTreeHE<K,V>::cleanup(K key, int tid){
    Node keyNode{he.getEra(),key,defltV,nullptr,nullptr};//node to be compared
    bool res=false;

    /* retrieve addresses stored in seek record */
    SeekRecord* seekRecord=&(records[tid]);
    Node* ancestor=getPtr(seekRecord->ancestor);
    Node* successor=getPtr(seekRecord->successor);
    Node* parent=getPtr(seekRecord->parent);
    Node* leaf=getPtr(seekRecord->leaf);

    std::atomic<Node*>* successorAddr=nullptr;
    std::atomic<Node*>* childAddr=nullptr;
    std::atomic<Node*>* siblingAddr=nullptr;

    /* obtain address of field of ancestor node that will be modified */
    if(nodeLess(&keyNode,ancestor))
        successorAddr=&(ancestor->left);
    else
        successorAddr=&(ancestor->right);

    /* obtain addresses of child fields of parent node */
    if(nodeLess(&keyNode,parent)){
        childAddr=&(parent->left);
        siblingAddr=&(parent->right);
    }
    else{
        childAddr=&(parent->right);
        siblingAddr=&(parent->left);
    }

    Node* tmpChild=childAddr->load(std::memory_order_acquire);
    if(!getFlg(tmpChild)){
        /* the leaf is not flagged, thus sibling node should be flagged */
        tmpChild=siblingAddr->load(std::memory_order_acquire);
        /* switch the sibling address */
        siblingAddr=childAddr;
    }

    /* use TAS to tag sibling edge */
    while(true){
        Node* untagged=siblingAddr->load(std::memory_order_acquire);
        Node* tagged=mixPtrFlgTg(getPtr(untagged),getFlg(untagged),true);
        if(siblingAddr->compare_exchange_strong(untagged,tagged,std::memory_order_acq_rel)){
            break;
        }
    }

    /* read the flag and address fields */
    Node* tmpSibling=siblingAddr->load(std::memory_order_acquire);

    /* make the sibling node a direct child of the ancestor node */
    res=successorAddr->compare_exchange_strong(successor,
        mixPtrFlgTg(getPtr(tmpSibling),getFlg(tmpSibling),false),
        std::memory_order_acq_rel);

    if(res==true){
        he.retire(getPtr(tmpChild),tid);
        he.retire(successor,tid);
    }
    return res;
}

/* to test rangeQuery */
// template <>
// optional<int> NatarajanTree<int,int>::get(int key, int tid){
//     int len=0;
//     auto x = rangeQuery(key-500,key,len,tid);
//     Node keyNode{key,defltV,nullptr,nullptr};//node to be compared
//     optional<int> res={};
//     SeekRecord* seekRecord=&(records[tid].ui);
//     Node* leaf=nullptr;
//     seek(key,tid);
//     leaf=getPtr(seekRecord->leaf);
//     if(nodeEqual(&keyNode,leaf)){
//         res = leaf->val;
//     }
//     return res;
// }

template <class K, class V>
std::optional<V> NatarajanTreeHE<K,V>::get(K key, int tid){
    Node keyNode{he.getEra(),key,defltV,nullptr,nullptr};//node to be compared
    std::optional<V> res={};
    SeekRecord* seekRecord=&(records[tid]);
    Node* leaf=nullptr;
    seek(key,tid);
    leaf=getPtr(seekRecord->leaf);
    if(nodeEqual(&keyNode,leaf)){
        res = leaf->val;
    }
    he.clear(tid);
    return res;
}

template <class K, class V>
std::optional<V> NatarajanTreeHE<K,V>::put(K key, V val, int tid){
    std::optional<V> res={};
    SeekRecord* seekRecord=&(records[tid]);

    Node* newInternal=nullptr;
    Node* newLeaf = new Node(he.getEra(),key,val,nullptr,nullptr);//also to compare keys

    Node* parent=nullptr;
    Node* leaf=nullptr;
    std::atomic<Node*>* childAddr=nullptr;
    while(true){
        seek(key,tid);
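        // seek() has (re)populated records[tid] with the ancestor/successor/
        // parent/leaf path for this key; the CAS below may still fail due to a
        // concurrent delete, in which case we help it finish via cleanup() and retry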
        leaf=getPtr(seekRecord->leaf);
        parent=getPtr(seekRecord->parent);
        if(!nodeEqual(newLeaf,leaf)){//key does not exist
            /* obtain address of the child field to be modified */
            if(nodeLess(newLeaf,parent))
                childAddr=&(parent->left);
            else
                childAddr=&(parent->right);

            /* create left and right leaves of newInternal */
            Node* newLeft=nullptr;
            Node* newRight=nullptr;
            if(nodeLess(newLeaf,leaf)){
                newLeft=newLeaf;
                newRight=leaf;
            }
            else{
                newLeft=leaf;
                newRight=newLeaf;
            }

            /* create newInternal */
            if(isInf(leaf)){
                int lev=getInfLevel(leaf);
                newInternal = new Node(he.getEra(),infK,defltV,newLeft,newRight,lev);
            }
            else
                newInternal = new Node(he.getEra(),std::max(key,leaf->key),defltV,newLeft,newRight);

            /* try to add the new nodes to the tree */
            Node* tmpExpected=getPtr(leaf);
            if(childAddr->compare_exchange_strong(tmpExpected,getPtr(newInternal),std::memory_order_acq_rel)){
                res={};
                break;//insertion succeeds
            }
            else{//fails; help conflicting delete operation
                delete newInternal;
                Node* tmpChild=childAddr->load(std::memory_order_acquire);
                if(getPtr(tmpChild)==leaf && (getFlg(tmpChild)||getTg(tmpChild))){
                    /*
                     * address of the child has not changed
                     * and either the leaf node or its sibling
                     * has been flagged for deletion
                     */
                    cleanup(key,tid);
                }
            }
        }
        else{//key exists, update and return old
            res=leaf->val;
            if(nodeLess(newLeaf,parent))
                childAddr=&(parent->left);
            else
                childAddr=&(parent->right);
            if(childAddr->compare_exchange_strong(leaf,newLeaf,std::memory_order_acq_rel)){
                he.retire(leaf,tid);
                break;
            }
        }
    }
    he.clear(tid);
    return res;
}

template <class K, class V>
bool NatarajanTreeHE<K,V>::insert(K key, V val, int tid) {
    bool res=false;
    SeekRecord* seekRecord=&(records[tid]);

    Node* newInternal=nullptr;
    Node* newLeaf = new Node(he.getEra(),key,val,nullptr,nullptr);//also for comparing keys

    Node* parent=nullptr;
    Node* leaf=nullptr;
    std::atomic<Node*>* childAddr=nullptr;
    while(true){
        seek(key,tid);
        leaf=getPtr(seekRecord->leaf);
        parent=getPtr(seekRecord->parent);
        if(!nodeEqual(newLeaf,leaf)){//key does not exist
            /* obtain address of the child field to be modified */
            if(nodeLess(newLeaf,parent))
                childAddr=&(parent->left);
            else
                childAddr=&(parent->right);

            /* create left and right leaves of newInternal */
            Node* newLeft=nullptr;
            Node* newRight=nullptr;
            if(nodeLess(newLeaf,leaf)){
                newLeft=newLeaf;
                newRight=leaf;
            }
            else{
                newLeft=leaf;
                newRight=newLeaf;
            }

            /* create newInternal */
            if(isInf(leaf)){
                int lev=getInfLevel(leaf);
                newInternal = new Node(he.getEra(),infK,defltV,newLeft,newRight,lev);
            }
            else
                newInternal = new Node(he.getEra(),std::max(key,leaf->key),defltV,newLeft,newRight);

            /* try to add the new nodes to the tree */
            Node* tmpExpected=getPtr(leaf);
            if(childAddr->compare_exchange_strong(tmpExpected,getPtr(newInternal),std::memory_order_acq_rel)){
                res=true;
                break;//insertion succeeds
            }
            else{//fails; help conflicting delete operation
                delete newInternal;
                Node* tmpChild=childAddr->load(std::memory_order_acquire);
                if(getPtr(tmpChild)==leaf && (getFlg(tmpChild)||getTg(tmpChild))){
                    /*
                     * address of the child has not changed
                     * and either the leaf node or its sibling
                     * has been flagged for deletion
                     */
                    cleanup(key,tid);
                }
            }
        }
        else{//key exists, insertion fails
            delete newLeaf;
            res=false;
            break;
        }
    }
    he.clear(tid);
    return res;
}

template <class K, class V>
std::optional<V> NatarajanTreeHE<K,V>::innerRemove(K key, int tid){
    bool injecting = true;
    std::optional<V> res={};
    SeekRecord* seekRecord=&(records[tid]);

    Node keyNode{he.getEra(),key,defltV,nullptr,nullptr};//node to be compared

    Node* parent=nullptr;
    Node* leaf=nullptr;
    std::atomic<Node*>* childAddr=nullptr;
    while(true){
        seek(key,tid);
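        // removal is done in two phases, as in the Natarajan-Mittal algorithm:
        // "injection" flags the edge to the leaf (making the delete logically
        // visible), then "cleanup" physically unlinks the leaf and its parent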
        parent=getPtr(seekRecord->parent);

        /* obtain address of the child field to be modified */
        if(nodeLess(&keyNode,parent))
            childAddr=&(parent->left);
        else
            childAddr=&(parent->right);

        if(injecting){
            /* injection mode: check if the key exists */
            leaf=getPtr(seekRecord->leaf);
            if(!nodeEqual(leaf,&keyNode)){//does not exist
                res={};
                break;
            }

            /* inject the delete operation into the tree */
            Node* tmpExpected=getPtr(leaf);
            res=leaf->val;
            if(childAddr->compare_exchange_strong(tmpExpected,
                mixPtrFlgTg(tmpExpected,true,false), std::memory_order_acq_rel)){
                /* advance to cleanup mode to remove the leaf node */
                injecting=false;
                if(cleanup(key,tid)) break;
            }
            else{
                Node* tmpChild=childAddr->load(std::memory_order_acquire);
                if(getPtr(tmpChild)==leaf && (getFlg(tmpChild)||getTg(tmpChild))){
                    /*
                     * address of the child has not
                     * changed and either the leaf
                     * node or its sibling has been
                     * flagged for deletion
                     */
                    cleanup(key,tid);
                }
            }
        }
        else{
            /* cleanup mode: check if flagged node still exists */
            if(seekRecord->leaf!=leaf){
                /* leaf no longer in the tree */
                break;
            }
            else{
                /* leaf still in the tree; remove */
                if(cleanup(key,tid)) break;
            }
        }
    }
    he.clear(tid);
    return res;
}

template <class K, class V>
std::optional<V> NatarajanTreeHE<K,V>::replace(K key, V val, int tid){
    std::optional<V> res={};
    SeekRecord* seekRecord=&(records[tid]);

    Node* newInternal=nullptr;
    Node* newLeaf = new Node(he.getEra(),key,val,nullptr,nullptr);//also to compare keys

    Node* parent=nullptr;
    Node* leaf=nullptr;
    std::atomic<Node*>* childAddr=nullptr;
    while(true){
        seek(key,tid);
        parent=getPtr(seekRecord->parent);
        leaf=getPtr(seekRecord->leaf);
        if(!nodeEqual(newLeaf,leaf)){//key does not exist, replace fails
            delete newLeaf;
            res={};
            break;
        }
        else{//key exists, update and return old
            res=leaf->val;
            if(nodeLess(newLeaf,parent))
                childAddr=&(parent->left);
            else
                childAddr=&(parent->right);
            if(childAddr->compare_exchange_strong(leaf,newLeaf,std::memory_order_acq_rel)){
                he.retire(leaf,tid);
                break;
            }
        }
    }
    he.clear(tid);
    return res;
}

template <class K, class V>
std::map<K,V> NatarajanTreeHE<K,V>::rangeQuery(K key1, K key2, int& len, int tid){
    //NOT HP-like GC safe.
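    //(the recursion in doRangeQuery re-uses era indexes 2 and 3 for deeper
    // nodes while raw ancestor pointers stay on the stack, so a concurrently
    // retired ancestor could be reclaimed while still referenced; acceptable
    // for these benchmarks, not for production use)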
    if(key1>key2) return {};

    Node k1{he.getEra(),key1,defltV,nullptr,nullptr};//node to be compared
    Node k2{he.getEra(),key2,defltV,nullptr,nullptr};//node to be compared

    Node* leaf = getPtr(he.get_protected(0, s->left, tid));
    Node* current = getPtr(he.get_protected(1, leaf->left, tid));

    std::map<K,V> res;
    if(current!=nullptr)
        doRangeQuery(k1,k2,tid,current,res);
    len=res.size();
    return res;
}

template <class K, class V>
void NatarajanTreeHE<K,V>::doRangeQuery(Node& k1, Node& k2, int tid, Node* root, std::map<K,V>& res){
    Node* left = getPtr(he.get_protected(2, root->left, tid));
    Node* right = getPtr(he.get_protected(3, root->right, tid));
    if(left==nullptr&&right==nullptr){
        if(nodeLessEqual(&k1,root)&&nodeLessEqual(root,&k2)){
            res.emplace(root->key,root->val);
        }
        return;
    }
    if(left!=nullptr){
        if(nodeLess(&k1,root)){
            doRangeQuery(k1,k2,tid,left,res);
        }
    }
    if(right!=nullptr){
        if(nodeLessEqual(root,&k2)){
            doRangeQuery(k1,k2,tid,right,res);
        }
    }
    return;
}

// Wrappers for the "set" benchmarks
template <class K, class V>
bool NatarajanTreeHE<K,V>::add(K key, int tid) {
    return insert(key,key,tid);
}

template <class K, class V>
bool NatarajanTreeHE<K,V>::remove(K key, int tid) {
    return innerRemove(key,tid).has_value();
}

template <class K, class V>
bool NatarajanTreeHE<K,V>::contains(K key, int tid) {
    return get(key,tid).has_value();
}

// Not lock-free
template <class K, class V>
void NatarajanTreeHE<K,V>::addAll(K** keys, const int size, const int tid) {
    for (int i = 0; i < size; i++) add(*keys[i], tid);
}

#endif


================================================
FILE: datastructures/treemaps/OFLFRedBlackTree.hpp
================================================
#ifndef _OF_LF_RED_BLACK_BST_H_
#define _OF_LF_RED_BLACK_BST_H_

#include <iostream>
#include <string>
#include <algorithm>

#include "stms/OneFileLF.hpp"   // This header defines the macros for the STM being compiled

// Adapted from Java to C++ from the original at http://algs4.cs.princeton.edu/code/edu/princeton/cs/algs4/RedBlackBST.java
template<typename K, typename V>
class OFLFRedBlackTree {

    const int64_t COLOR_RED = 0;
    const int64_t COLOR_BLACK = 1;

    struct Node : public oflf::tmbase {
        oflf::tmtype<K>       key;
        oflf::tmtype<V>       val;
        oflf::tmtype<Node*>   left {nullptr};
        oflf::tmtype<Node*>   right {nullptr};
        oflf::tmtype<int64_t> color;    // color of parent link
        oflf::tmtype<int64_t> size;     // subtree count
        Node(const K& key, const V& val, int64_t color, int64_t size) : key{key}, val{val}, color{color}, size{size} {}
    };

    oflf::tmtype<Node*> root {nullptr};   // root of the BST

    inline void assignAndFreeIfNull(oflf::tmtype<Node*>& z, Node* w) {
        Node* tofree = z;
        z = w;
        if (w == nullptr) oflf::tmDelete(tofree);
    }

public:
    /**
     * Initializes an empty symbol table.
     */
    OFLFRedBlackTree(int numThreads=0){ }

    ~OFLFRedBlackTree() {
        for (int i = 0; i < 10000; i++) {
            oflf::updateTx([&] () {
                if (root == nullptr) return;
                deleteMin();
            });
        }
    }

    /***************************************************************************
     *  Node helper methods.
     ***************************************************************************/
    // is node x red; false if x is null ?
    bool isRed(Node* x) {
        if (x == nullptr) return false;
        return x->color == COLOR_RED;
    }

    // number of nodes in the subtree rooted at x; 0 if x is null
    int size(Node* x) {
        if (x == nullptr) return 0;
        return x->size;
    }

    /**
     * Returns the number of key-value pairs in this symbol table.
     * @return the number of key-value pairs in this symbol table
     */
    int size() { return size(root); }

    /**
     * Is this symbol table empty?
     * @return {@code true} if this symbol table is empty and {@code false} otherwise
     */
    bool isEmpty() { return root == nullptr; }

    /***************************************************************************
     *  Standard BST search.
     ***************************************************************************/

    /**
     * Returns the value associated with the given key.
     * @param key the key
     * @return the value associated with the given key if the key is in the symbol table
     *     and {@code null} if the key is not in the symbol table
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    bool innerGet(K& key, V& oldValue, const bool saveOldValue) {
        bool found = get(root, key);
        if (!found) return false;
        //if (saveOldValue) oldValue = *val;  // Copy of V
        return true;
    }

    // value associated with the given key in subtree rooted at x; null if no such key
    bool get(Node* x, const K& key) {
        while (x != nullptr) {
            if (key < x->key) x = x->left;
            else if (x->key < key) x = x->right;
            else return true;
        }
        return false;
    }

    /**
     * Does this symbol table contain the given key?
     * @param key the key
     * @return {@code true} if this symbol table contains {@code key} and
     *     {@code false} otherwise
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    bool containsKey(const K& key) {
        return get(root, key);
    }

    /***************************************************************************
     *  Red-black tree insertion.
     ***************************************************************************/

    /**
     * Inserts the specified key-value pair into the symbol table, overwriting the old
     * value with the new value if the symbol table already contains the specified key.
     * Deletes the specified key (and its associated value) from this symbol table
     * if the specified value is {@code null}.
     *
     * @param key the key
     * @param val the value
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    bool innerPut(const K& key, const V& value) {
        bool ret = false;
        root = put(root, key, value, ret);
        root->color = COLOR_BLACK;
        return ret;
    }

    // insert the key-value pair in the subtree rooted at h
    Node* put(Node* h, const K& key, const V& val, bool& ret) {
        if (h == nullptr) {
            ret = true;
            return oflf::tmNew<Node>(key, val, COLOR_RED, 1);
        }
        if (key < h->key) h->left = put(h->left, key, val, ret);
        else if (h->key < key) h->right = put(h->right, key, val, ret);
        else h->val = val;
        // fix-up any right-leaning links
        if (isRed(h->right) && !isRed(h->left)) h = rotateLeft(h);
        if (isRed(h->left) && isRed(h->left->left)) h = rotateRight(h);
        if (isRed(h->left) && isRed(h->right)) flipColors(h);
        h->size = size(h->left) + size(h->right) + 1;
        return h;
    }

    /***************************************************************************
     *  Red-black tree deletion.
     ***************************************************************************/

    /**
     * Removes the smallest key and associated value from the symbol table.
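     * (the destructor relies on this method to drain the tree one element per
     * transaction, keeping each transaction's allocation/store log small)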
* @throws NoSuchElementException if the symbol table is empty */ void deleteMin() { if (isEmpty()) return; // if both children of root are black, set root to red if (!isRed(root->left) && !isRed(root->right)) root->color = COLOR_RED; assignAndFreeIfNull(root, deleteMin(root)); if (!isEmpty()) root->color = COLOR_BLACK; // assert check(); } // delete the key-value pair with the minimum key rooted at h Node* deleteMin(Node* h) { if (h->left == nullptr) return nullptr; if (!isRed(h->left) && !isRed(h->left->left)) h = moveRedLeft(h); assignAndFreeIfNull(h->left, deleteMin(h->left)); return balance(h); } /** * Removes the largest key and associated value from the symbol table. * @throws NoSuchElementException if the symbol table is empty */ void deleteMax() { if (isEmpty()) return; // if both children of root are black, set root to red if (!isRed(root->left) && !isRed(root->right)) root->color = COLOR_RED; root = deleteMax(root); if (!isEmpty()) root->color = COLOR_BLACK; // assert check(); } // delete the key-value pair with the maximum key rooted at h Node* deleteMax(Node* h) { if (isRed(h->left)) h = rotateRight(h); if (h->right == nullptr) return nullptr; if (!isRed(h->right) && !isRed(h->right->left)) h = moveRedRight(h); h->right = deleteMax(h->right); return balance(h); } /** * Removes the specified key and its associated value from this symbol table * (if the key is in this symbol table). * * @param key the key */ void innerRemove(const K& key) { // if both children of root are black, set root to red if (!isRed(root->left) && !isRed(root->right)) root->color = COLOR_RED; assignAndFreeIfNull(root, deleteKey(root, key)); if (!isEmpty()) root->color = COLOR_BLACK; // assert check(); } // delete the key-value pair with the given key rooted at h Node* deleteKey(Node* h, const K& key) { // assert get(h, key) != null; if (key < h->key) { if (!isRed(h->left) && !isRed(h->left->left)) { h = moveRedLeft(h); } assignAndFreeIfNull(h->left, deleteKey(h->left, key)); } else { if (isRed(h->left)) { h = rotateRight(h); } if (key == h->key && (h->right == nullptr)) { return nullptr; } if (!isRed(h->right) && !isRed(h->right->left)) { h = moveRedRight(h); } if (key == h->key) { Node* x = min(h->right); h->key = x->key; h->val = x->val; // h->val = get(h->right, min(h->right).key); // h->key = min(h->right).key; assignAndFreeIfNull(h->right, deleteMin(h->right)); } else { assignAndFreeIfNull(h->right, deleteKey(h->right, key)); } } return balance(h); } /*************************************************************************** * Red-black tree helper functions. 
***************************************************************************/ // make a left-leaning link lean to the right Node* rotateRight(Node* h) { // assert (h != null) && isRed(h->left); Node* x = h->left; h->left = x->right; x->right = h; x->color = x->right->color; x->right->color = COLOR_RED; x->size = h->size; h->size = size(h->left) + size(h->right) + 1; return x; } // make a right-leaning link lean to the left Node* rotateLeft(Node* h) { // assert (h != null) && isRed(h->right); Node* x = h->right; h->right = x->left; x->left = h; x->color = x->left->color; x->left->color = COLOR_RED; x->size = h->size; h->size = size(h->left) + size(h->right) + 1; return x; } // flip the colors of a node and its two children void flipColors(Node* h) { // h must have opposite color of its two children // assert (h != null) && (h->left != null) && (h->right != null); // assert (!isRed(h) && isRed(h->left) && isRed(h->right)) // || (isRed(h) && !isRed(h->left) && !isRed(h->right)); h->color = !h->color; h->left->color = !h->left->color; h->right->color = !h->right->color; } // Assuming that h is red and both h->left and h->left.left // are black, make h->left or one of its children red. Node* moveRedLeft(Node* h) { // assert (h != null); // assert isRed(h) && !isRed(h->left) && !isRed(h->left.left); flipColors(h); if (isRed(h->right->left)) { h->right = rotateRight(h->right); h = rotateLeft(h); flipColors(h); } return h; } // Assuming that h is red and both h->right and h->right.left // are black, make h->right or one of its children red. Node* moveRedRight(Node* h) { // assert (h != null); // assert isRed(h) && !isRed(h->right) && !isRed(h->right.left); flipColors(h); if (isRed(h->left->left)) { h = rotateRight(h); flipColors(h); } return h; } // restore red-black tree invariant Node* balance(Node* h) { // assert (h != null); if (isRed(h->right)) h = rotateLeft(h); if (isRed(h->left) && isRed(h->left->left)) h = rotateRight(h); if (isRed(h->left) && isRed(h->right)) flipColors(h); h->size = size(h->left) + size(h->right) + 1; return h; } /*************************************************************************** * Utility functions. ***************************************************************************/ /** * Returns the height of the BST (for debugging). * @return the height of the BST (a 1-node tree has height 0) */ int height() { return height(root); } int height(Node* x) { if (x == nullptr) return -1; return 1 + std::max(height(x->left), height(x->right)); } /*************************************************************************** * Ordered symbol table methods. ***************************************************************************/ /** * Returns the smallest key in the symbol table. * @return the smallest key in the symbol table * @throws NoSuchElementException if the symbol table is empty */ K* min() { if (isEmpty()) return nullptr; return min(root).key; } // the smallest key in subtree rooted at x; null if no such key Node* min(Node* x) { // assert x != null; if (x->left == nullptr) return x; else return min(x->left); } /** * Returns the largest key in the symbol table. 
     * @return the largest key in the symbol table
     * @throws NoSuchElementException if the symbol table is empty
     */
    K* max() {
        if (isEmpty()) return nullptr;
        return max(root).key;
    }

    // the largest key in the subtree rooted at x; null if no such key
    Node* max(Node* x) {
        // assert x != null;
        if (x->right == nullptr) return x;
        else                     return max(x->right);
    }

    /**
     * Returns the largest key in the symbol table less than or equal to {@code key}.
     * @param key the key
     * @return the largest key in the symbol table less than or equal to {@code key}
     * @throws NoSuchElementException if there is no such key
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    K* floor(const K& key) {
        if (key == nullptr) return nullptr;
        if (isEmpty()) return nullptr;
        Node* x = floor(root, key);
        if (x == nullptr) return nullptr;
        else              return x->key;
    }

    // the largest key in the subtree rooted at x less than or equal to the given key
    Node* floor(Node* x, const K& key) {
        if (x == nullptr) return nullptr;
        if (key == x->key) return x;
        if (key < x->key) return floor(x->left, key);
        Node* t = floor(x->right, key);
        if (t != nullptr) return t;
        else              return x;
    }

    /**
     * Returns the smallest key in the symbol table greater than or equal to {@code key}.
     * @param key the key
     * @return the smallest key in the symbol table greater than or equal to {@code key}
     * @throws NoSuchElementException if there is no such key
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    K* ceiling(const K& key) {
        if (key == nullptr) return nullptr;
        if (isEmpty()) return nullptr;
        Node* x = ceiling(root, key);
        if (x == nullptr) return nullptr;
        else              return x->key;
    }

    // the smallest key in the subtree rooted at x greater than or equal to the given key
    Node* ceiling(Node* x, const K& key) {
        if (x == nullptr) return nullptr;
        if (key == x->key) return x;
        if (x->key < key) return ceiling(x->right, key);
        Node* t = ceiling(x->left, key);
        if (t != nullptr) return t;
        else              return x;
    }

    /**
     * Return the kth smallest key in the symbol table.
     * @param k the order statistic
     * @return the {@code k}th smallest key in the symbol table
     * @throws IllegalArgumentException unless {@code k} is between 0 and n-1
     */
    K* select(int k) {
        if (k < 0 || k >= size()) {
            return nullptr;
        }
        Node* x = select(root, k);
        return x->key;
    }

    // the key of rank k in the subtree rooted at x
    Node* select(Node* x, int k) {
        // assert x != null;
        // assert k >= 0 && k < size(x);
        int t = size(x->left);
        if      (t > k) return select(x->left,  k);
        else if (t < k) return select(x->right, k-t-1);
        else            return x;
    }

    /**
     * Return the number of keys in the symbol table strictly less than {@code key}.
     * @param key the key
     * @return the number of keys in the symbol table strictly less than {@code key}
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    int rank(const K& key) {
        if (key == nullptr) return -1;
        return rank(key, root);
    }

    // number of keys less than key in the subtree rooted at x
    int rank(const K& key, Node* x) {
        if (x == nullptr) return 0;
        if      (key < x->key) return rank(key, x->left);
        else if (x->key < key) return 1 + size(x->left) + rank(key, x->right);
        else                   return size(x->left);
    }

    /***************************************************************************
     *  Range count and range search.
     ***************************************************************************/

    /**
     * Returns the number of keys in the symbol table in the given range.
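     * The count is computed from ranks: rank(hi) - rank(lo), plus one when hi
     * itself is present. For example, with keys {10, 20, 30}, size(10, 25)
     * returns rank(25) - rank(10) = 2 - 0 = 2, i.e. the keys 10 and 20.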
     *
     * @param lo minimum endpoint
     * @param hi maximum endpoint
     * @return the number of keys in the symbol table between {@code lo}
     *     (inclusive) and {@code hi} (inclusive)
     * @throws IllegalArgumentException if either {@code lo} or {@code hi}
     *     is {@code null}
     */
    int size(const K& lo, const K& hi) {
        if (lo == nullptr) return 0;
        if (hi == nullptr) return 0;
        if (hi < lo) return 0;
        if (containsKey(hi)) return rank(hi) - rank(lo) + 1;
        else                 return rank(hi) - rank(lo);
    }

    /***************************************************************************
     *  Check integrity of red-black tree data structure.
     ***************************************************************************/
    bool check() {
        if (!isBST())            std::cout << "Not in symmetric order\n";
        if (!isSizeConsistent()) std::cout << "Subtree counts not consistent\n";
        //if (!isRankConsistent()) std::cout << "Ranks not consistent\n";
        if (!is23())             std::cout << "Not a 2-3 tree\n";
        if (!isBalanced())       std::cout << "Not balanced\n";
        return isBST() && isSizeConsistent() && is23() && isBalanced();
    }

    // does this binary tree satisfy symmetric order?
    // Note: this test also ensures that data structure is a binary tree since order is strict
    bool isBST() {
        return isBST(root, nullptr, nullptr);
    }

    // is the tree rooted at x a BST with all keys strictly between min and max
    // (if min or max is null, treat as empty constraint)
    // Credit: Bob Dondero's elegant solution
    bool isBST(Node* x, K* min, K* max) {
        if (x == nullptr) return true;
        // TODO: port these two lines
        //if (min != nullptr && x->key.compareTo(min) <= 0) return false;
        //if (max != nullptr && x->key.compareTo(max) >= 0) return false;
        return isBST(x->left, min, x->key) && isBST(x->right, x->key, max);
    }

    // are the size fields correct?
    bool isSizeConsistent() { return isSizeConsistent(root); }
    bool isSizeConsistent(Node* x) {
        if (x == nullptr) return true;
        if (x->size != size(x->left) + size(x->right) + 1) return false;
        return isSizeConsistent(x->left) && isSizeConsistent(x->right);
    }

    /*
    // check that ranks are consistent
    bool isRankConsistent() {
        for (int i = 0; i < size(); i++)
            if (i != rank(select(i))) return false;
        for (K* key : keys())
            if (key.compareTo(select(rank(key))) != 0) return false;
        return true;
    }
    */

    // Does the tree have no red right links, and at most one (left)
    // red link in a row on any path?
    bool is23() { return is23(root); }
    bool is23(Node* x) {
        if (x == nullptr) return true;
        if (isRed(x->right)) return false;
        if (x != root && isRed(x) && isRed(x->left)) return false;
        return is23(x->left) && is23(x->right);
    }

    // do all paths from root to leaf have same number of black edges?
    bool isBalanced() {
        int black = 0;     // number of black links on path from root to min
        Node* x = root;
        while (x != nullptr) {
            if (!isRed(x)) black++;
            x = x->left;
        }
        return isBalanced(root, black);
    }

    // does every path from the root to a leaf have the given number of black links?
    bool isBalanced(Node* x, int black) {
        if (x == nullptr) return black == 0;
        if (!isRed(x)) black--;
        return isBalanced(x->left, black) && isBalanced(x->right, black);
    }

    // Inserts a key only if it's not already present
    bool add(K key, const int tid=0) {
        return oflf::updateTx([&] () { return innerPut(key,key); });
    }

    // Returns true only if the key was present
    bool remove(K key, const int tid=0) {
        return oflf::updateTx([&] () {
            V notused;
            bool retval = innerGet(key,notused,false);
            if (retval) innerRemove(key);
            return retval;
        });
    }

    bool contains(K key, const int tid=0) {
        return oflf::readTx([&] () {
            V notused;
            return innerGet(key,notused,false);
        });
    }

    // This is not fully transactional, but that's ok because we use it only on initialization.
    // We could make it fully transactional, but we would have to increase the size of the allocation/store logs.
    void addAll(K** keys, int size, const int tid=0) {
        for (int i = 0; i < size; i++) add(*keys[i], tid);
    }

    static std::string className() { return "OF-LF-RedBlackTree"; }
};

#endif   // _OF_LF_RED_BLACK_BST_H_


================================================
FILE: datastructures/treemaps/OFWFRedBlackTree.hpp
================================================
#ifndef _OF_WF_RED_BLACK_BST_H_
#define _OF_WF_RED_BLACK_BST_H_

#include <iostream>
#include <string>
#include <algorithm>

#include "stms/OneFileWF.hpp"   // This header defines the macros for the STM being compiled

// Adapted from Java to C++ from the original at http://algs4.cs.princeton.edu/code/edu/princeton/cs/algs4/RedBlackBST.java
template<typename K, typename V>
class OFWFRedBlackTree {

    const int64_t COLOR_RED = 0;
    const int64_t COLOR_BLACK = 1;

    struct Node : public ofwf::tmbase {
        ofwf::tmtype<K>       key;
        ofwf::tmtype<V>       val;
        ofwf::tmtype<Node*>   left {nullptr};
        ofwf::tmtype<Node*>   right {nullptr};
        ofwf::tmtype<int64_t> color;    // color of parent link
        ofwf::tmtype<int64_t> size;     // subtree count
        Node(const K& key, const V& val, int64_t color, int64_t size) : key{key}, val{val}, color{color}, size{size} {}
    };

    ofwf::tmtype<Node*> root {nullptr};   // root of the BST

    inline void assignAndFreeIfNull(ofwf::tmtype<Node*>& z, Node* w) {
        Node* tofree = z;
        z = w;
        if (w == nullptr) ofwf::tmDelete(tofree);
    }

public:
    /**
     * Initializes an empty symbol table.
     */
    OFWFRedBlackTree(int numThreads=0){ }

    ~OFWFRedBlackTree() {
        for (int i = 0; i < 10000; i++) {
            ofwf::updateTx([&] () {
                if (root == nullptr) return;
                deleteMin();
            });
        }
    }

    /***************************************************************************
     *  Node helper methods.
     ***************************************************************************/
    // is node x red; false if x is null ?
    bool isRed(Node* x) {
        if (x == nullptr) return false;
        return x->color == COLOR_RED;
    }

    // number of nodes in the subtree rooted at x; 0 if x is null
    int size(Node* x) {
        if (x == nullptr) return 0;
        return x->size;
    }

    /**
     * Returns the number of key-value pairs in this symbol table.
     * @return the number of key-value pairs in this symbol table
     */
    int size() { return size(root); }

    /**
     * Is this symbol table empty?
     * @return {@code true} if this symbol table is empty and {@code false} otherwise
     */
    bool isEmpty() { return root == nullptr; }

    /***************************************************************************
     *  Standard BST search.
     ***************************************************************************/

    /**
     * Returns the value associated with the given key.
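     * (in this adaptation the method only reports whether the key is present;
     * the copy of the old value into 'oldValue' is commented out in the body)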
     * @param key the key
     * @return the value associated with the given key if the key is in the symbol table
     *     and {@code null} if the key is not in the symbol table
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    bool innerGet(K key, V& oldValue, const bool saveOldValue) {
        bool found = get(root, key);
        if (!found) return false;
        //if (saveOldValue) oldValue = *val;  // Copy of V
        return true;
    }

    // value associated with the given key in subtree rooted at x; null if no such key
    bool get(Node* x, const K& key) {
        while (x != nullptr) {
            if (key < x->key) x = x->left;
            else if (x->key < key) x = x->right;
            else return true;
        }
        return false;
    }

    /**
     * Does this symbol table contain the given key?
     * @param key the key
     * @return {@code true} if this symbol table contains {@code key} and
     *     {@code false} otherwise
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    bool containsKey(const K& key) {
        return get(root, key);
    }

    /***************************************************************************
     *  Red-black tree insertion.
     ***************************************************************************/

    /**
     * Inserts the specified key-value pair into the symbol table, overwriting the old
     * value with the new value if the symbol table already contains the specified key.
     * Deletes the specified key (and its associated value) from this symbol table
     * if the specified value is {@code null}.
     *
     * @param key the key
     * @param val the value
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    bool innerPut(const K& key, const V& value) {
        bool ret = false;
        root = put(root, key, value, ret);
        root->color = COLOR_BLACK;
        return ret;
    }

    // insert the key-value pair in the subtree rooted at h
    Node* put(Node* h, const K& key, const V& val, bool& ret) {
        if (h == nullptr) {
            ret = true;
            return ofwf::tmNew<Node>(key, val, COLOR_RED, 1);
        }
        if (key < h->key) h->left = put(h->left, key, val, ret);
        else if (h->key < key) h->right = put(h->right, key, val, ret);
        else h->val = val;
        // fix-up any right-leaning links
        if (isRed(h->right) && !isRed(h->left)) h = rotateLeft(h);
        if (isRed(h->left) && isRed(h->left->left)) h = rotateRight(h);
        if (isRed(h->left) && isRed(h->right)) flipColors(h);
        h->size = size(h->left) + size(h->right) + 1;
        return h;
    }

    /***************************************************************************
     *  Red-black tree deletion.
     ***************************************************************************/

    /**
     * Removes the smallest key and associated value from the symbol table.
     * @throws NoSuchElementException if the symbol table is empty
     */
    void deleteMin() {
        if (isEmpty()) return;
        // if both children of root are black, set root to red
        if (!isRed(root->left) && !isRed(root->right)) root->color = COLOR_RED;
        assignAndFreeIfNull(root, deleteMin(root));
        if (!isEmpty()) root->color = COLOR_BLACK;
        // assert check();
    }

    // delete the key-value pair with the minimum key rooted at h
    Node* deleteMin(Node* h) {
        if (h->left == nullptr) return nullptr;
        if (!isRed(h->left) && !isRed(h->left->left)) h = moveRedLeft(h);
        assignAndFreeIfNull(h->left, deleteMin(h->left));
        return balance(h);
    }

    /**
     * Removes the largest key and associated value from the symbol table.
* @throws NoSuchElementException if the symbol table is empty */ void deleteMax() { if (isEmpty()) return; // if both children of root are black, set root to red if (!isRed(root->left) && !isRed(root->right)) root->color = COLOR_RED; root = deleteMax(root); if (!isEmpty()) root->color = COLOR_BLACK; // assert check(); } // delete the key-value pair with the maximum key rooted at h Node* deleteMax(Node* h) { if (isRed(h->left)) h = rotateRight(h); if (h->right == nullptr) return nullptr; if (!isRed(h->right) && !isRed(h->right->left)) h = moveRedRight(h); h->right = deleteMax(h->right); return balance(h); } /** * Removes the specified key and its associated value from this symbol table * (if the key is in this symbol table). * * @param key the key */ void innerRemove(K key) { // if both children of root are black, set root to red if (!isRed(root->left) && !isRed(root->right)) root->color = COLOR_RED; assignAndFreeIfNull(root, deleteKey(root, key)); if (!isEmpty()) root->color = COLOR_BLACK; // assert check(); } // delete the key-value pair with the given key rooted at h Node* deleteKey(Node* h, const K& key) { // assert get(h, key) != null; if (key < h->key) { if (!isRed(h->left) && !isRed(h->left->left)) { h = moveRedLeft(h); } assignAndFreeIfNull(h->left, deleteKey(h->left, key)); } else { if (isRed(h->left)) { h = rotateRight(h); } if (key == h->key && (h->right == nullptr)) { return nullptr; } if (!isRed(h->right) && !isRed(h->right->left)) { h = moveRedRight(h); } if (key == h->key) { Node* x = min(h->right); h->key = x->key; h->val = x->val; // h->val = get(h->right, min(h->right).key); // h->key = min(h->right).key; assignAndFreeIfNull(h->right, deleteMin(h->right)); } else { assignAndFreeIfNull(h->right, deleteKey(h->right, key)); } } return balance(h); } /*************************************************************************** * Red-black tree helper functions. ***************************************************************************/ // make a left-leaning link lean to the right Node* rotateRight(Node* h) { // assert (h != null) && isRed(h->left); Node* x = h->left; h->left = x->right; x->right = h; x->color = x->right->color; x->right->color = COLOR_RED; x->size = h->size; h->size = size(h->left) + size(h->right) + 1; return x; } // make a right-leaning link lean to the left Node* rotateLeft(Node* h) { // assert (h != null) && isRed(h->right); Node* x = h->right; h->right = x->left; x->left = h; x->color = x->left->color; x->left->color = COLOR_RED; x->size = h->size; h->size = size(h->left) + size(h->right) + 1; return x; } // flip the colors of a node and its two children void flipColors(Node* h) { // h must have opposite color of its two children // assert (h != null) && (h->left != null) && (h->right != null); // assert (!isRed(h) && isRed(h->left) && isRed(h->right)) // || (isRed(h) && !isRed(h->left) && !isRed(h->right)); h->color = !h->color; h->left->color = !h->left->color; h->right->color = !h->right->color; } // Assuming that h is red and both h->left and h->left.left // are black, make h->left or one of its children red. Node* moveRedLeft(Node* h) { // assert (h != null); // assert isRed(h) && !isRed(h->left) && !isRed(h->left.left); flipColors(h); if (isRed(h->right->left)) { h->right = rotateRight(h->right); h = rotateLeft(h); flipColors(h); } return h; } // Assuming that h is red and both h->right and h->right.left // are black, make h->right or one of its children red. 
Node* moveRedRight(Node* h) { // assert (h != null); // assert isRed(h) && !isRed(h->right) && !isRed(h->right.left); flipColors(h); if (isRed(h->left->left)) { h = rotateRight(h); flipColors(h); } return h; } // restore red-black tree invariant Node* balance(Node* h) { // assert (h != null); if (isRed(h->right)) h = rotateLeft(h); if (isRed(h->left) && isRed(h->left->left)) h = rotateRight(h); if (isRed(h->left) && isRed(h->right)) flipColors(h); h->size = size(h->left) + size(h->right) + 1; return h; } /*************************************************************************** * Utility functions. ***************************************************************************/ /** * Returns the height of the BST (for debugging). * @return the height of the BST (a 1-node tree has height 0) */ int height() { return height(root); } int height(Node* x) { if (x == nullptr) return -1; return 1 + std::max(height(x->left), height(x->right)); } /*************************************************************************** * Ordered symbol table methods. ***************************************************************************/ /** * Returns the smallest key in the symbol table. * @return the smallest key in the symbol table * @throws NoSuchElementException if the symbol table is empty */ K* min() { if (isEmpty()) return nullptr; return min(root).key; } // the smallest key in subtree rooted at x; null if no such key Node* min(Node* x) { // assert x != null; if (x->left == nullptr) return x; else return min(x->left); } /** * Returns the largest key in the symbol table. * @return the largest key in the symbol table * @throws NoSuchElementException if the symbol table is empty */ K* max() { if (isEmpty()) return nullptr; return max(root).key; } // the largest key in the subtree rooted at x; null if no such key Node* max(Node* x) { // assert x != null; if (x->right == nullptr) return x; else return max(x->right); } /** * Returns the largest key in the symbol table less than or equal to {@code key}. * @param key the key * @return the largest key in the symbol table less than or equal to {@code key} * @throws NoSuchElementException if there is no such key * @throws IllegalArgumentException if {@code key} is {@code null} */ K* floor(const K& key) { if (key == nullptr) return nullptr; if (isEmpty()) return nullptr; Node* x = floor(root, key); if (x == nullptr) return nullptr; else return x->key; } // the largest key in the subtree rooted at x less than or equal to the given key Node* floor(Node* x, const K& key) { if (x == nullptr) return nullptr; if (key == x->key) return x; if (key < x->key) return floor(x->left, key); Node* t = floor(x->right, key); if (t != nullptr) return t; else return x; } /** * Returns the smallest key in the symbol table greater than or equal to {@code key}. 
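 * For example, with keys {10, 20, 30}: ceiling(15) yields 20, ceiling(20)
 * yields 20, and ceiling(35) yields null because no key is >= 35.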
     * @param key the key
     * @return the smallest key in the symbol table greater than or equal to {@code key}
     * @throws NoSuchElementException if there is no such key
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    K* ceiling(const K& key) {
        if (key == nullptr) return nullptr;
        if (isEmpty()) return nullptr;
        Node* x = ceiling(root, key);
        if (x == nullptr) return nullptr;
        else              return x->key;
    }

    // the smallest key in the subtree rooted at x greater than or equal to the given key
    Node* ceiling(Node* x, const K& key) {
        if (x == nullptr) return nullptr;
        if (key == x->key) return x;
        if (x->key < key) return ceiling(x->right, key);
        Node* t = ceiling(x->left, key);
        if (t != nullptr) return t;
        else              return x;
    }

    /**
     * Return the kth smallest key in the symbol table.
     * @param k the order statistic
     * @return the {@code k}th smallest key in the symbol table
     * @throws IllegalArgumentException unless {@code k} is between 0 and n-1
     */
    K* select(int k) {
        if (k < 0 || k >= size()) {
            return nullptr;
        }
        Node* x = select(root, k);
        return x->key;
    }

    // the key of rank k in the subtree rooted at x
    Node* select(Node* x, int k) {
        // assert x != null;
        // assert k >= 0 && k < size(x);
        int t = size(x->left);
        if      (t > k) return select(x->left,  k);
        else if (t < k) return select(x->right, k-t-1);
        else            return x;
    }

    /**
     * Return the number of keys in the symbol table strictly less than {@code key}.
     * @param key the key
     * @return the number of keys in the symbol table strictly less than {@code key}
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    int rank(const K& key) {
        if (key == nullptr) return -1;
        return rank(key, root);
    }

    // number of keys less than key in the subtree rooted at x
    int rank(const K& key, Node* x) {
        if (x == nullptr) return 0;
        if      (key < x->key) return rank(key, x->left);
        else if (x->key < key) return 1 + size(x->left) + rank(key, x->right);
        else                   return size(x->left);
    }

    /***************************************************************************
     *  Range count and range search.
     ***************************************************************************/

    /**
     * Returns the number of keys in the symbol table in the given range.
     *
     * @param lo minimum endpoint
     * @param hi maximum endpoint
     * @return the number of keys in the symbol table between {@code lo}
     *     (inclusive) and {@code hi} (inclusive)
     * @throws IllegalArgumentException if either {@code lo} or {@code hi}
     *     is {@code null}
     */
    int size(const K& lo, const K& hi) {
        if (lo == nullptr) return 0;
        if (hi == nullptr) return 0;
        if (hi < lo) return 0;
        if (containsKey(hi)) return rank(hi) - rank(lo) + 1;
        else                 return rank(hi) - rank(lo);
    }

    /***************************************************************************
     *  Check integrity of red-black tree data structure.
     ***************************************************************************/
    bool check() {
        if (!isBST())            std::cout << "Not in symmetric order\n";
        if (!isSizeConsistent()) std::cout << "Subtree counts not consistent\n";
        //if (!isRankConsistent()) std::cout << "Ranks not consistent\n";
        if (!is23())             std::cout << "Not a 2-3 tree\n";
        if (!isBalanced())       std::cout << "Not balanced\n";
        return isBST() && isSizeConsistent() && is23() && isBalanced();
    }

    // does this binary tree satisfy symmetric order?
    // Note: this test also ensures that data structure is a binary tree since order is strict
    bool isBST() {
        return isBST(root, nullptr, nullptr);
    }

    // is the tree rooted at x a BST with all keys strictly between min and max
    // (if min or max is null, treat as empty constraint)
    // Credit: Bob Dondero's elegant solution
    bool isBST(Node* x, K* min, K* max) {
        if (x == nullptr) return true;
        // TODO: port these two lines
        //if (min != nullptr && x->key.compareTo(min) <= 0) return false;
        //if (max != nullptr && x->key.compareTo(max) >= 0) return false;
        return isBST(x->left, min, x->key) && isBST(x->right, x->key, max);
    }

    // are the size fields correct?
    bool isSizeConsistent() { return isSizeConsistent(root); }
    bool isSizeConsistent(Node* x) {
        if (x == nullptr) return true;
        if (x->size != size(x->left) + size(x->right) + 1) return false;
        return isSizeConsistent(x->left) && isSizeConsistent(x->right);
    }

    /*
    // check that ranks are consistent
    bool isRankConsistent() {
        for (int i = 0; i < size(); i++)
            if (i != rank(select(i))) return false;
        for (K* key : keys())
            if (key.compareTo(select(rank(key))) != 0) return false;
        return true;
    }
    */

    // Does the tree have no red right links, and at most one (left)
    // red link in a row on any path?
    bool is23() { return is23(root); }
    bool is23(Node* x) {
        if (x == nullptr) return true;
        if (isRed(x->right)) return false;
        if (x != root && isRed(x) && isRed(x->left)) return false;
        return is23(x->left) && is23(x->right);
    }

    // do all paths from root to leaf have same number of black edges?
    bool isBalanced() {
        int black = 0;     // number of black links on path from root to min
        Node* x = root;
        while (x != nullptr) {
            if (!isRed(x)) black++;
            x = x->left;
        }
        return isBalanced(root, black);
    }

    // does every path from the root to a leaf have the given number of black links?
    bool isBalanced(Node* x, int black) {
        if (x == nullptr) return black == 0;
        if (!isRed(x)) black--;
        return isBalanced(x->left, black) && isBalanced(x->right, black);
    }

    // Inserts a key only if it's not already present
    bool add(K key, const int tid=0) {
        return ofwf::updateTx([=] () { return innerPut(key,key); });
    }

    // Returns true only if the key was present
    bool remove(K key, const int tid=0) {
        return ofwf::updateTx([=] () {
            V notused;
            bool retval = innerGet(key,notused,false);
            if (retval) innerRemove(key);
            return retval;
        });
    }

    bool contains(K key, const int tid=0) {
        return ofwf::readTx([=] () {
            V notused;
            return innerGet(key,notused,false);
        });
    }

    void addAll(K** keys, int size, const int tid=0) {
        for (int i = 0; i < size; i++) add(*keys[i], tid);
    }

    static std::string className() { return "OF-WF-RedBlackTree"; }
};

#endif   // _OF_WF_RED_BLACK_BST_H_


================================================
FILE: datastructures/treemaps/TinySTMRedBlackTree.hpp
================================================
#ifndef _TINY_STM_RED_BLACK_BST_H_
#define _TINY_STM_RED_BLACK_BST_H_

#include <iostream>
#include <string>
#include <algorithm>

#include "stms/TinySTM.hpp"

// Adapted from Java to C++ from the original at http://algs4.cs.princeton.edu/code/edu/princeton/cs/algs4/RedBlackBST.java
template<typename K, typename V>
class TinySTMRedBlackTree {

    const int64_t COLOR_RED = 0;
    const int64_t COLOR_BLACK = 1;

    struct Node {
        tinystm::tmtype<K>       key;
        tinystm::tmtype<V>       val;
        tinystm::tmtype<Node*>   left {nullptr};
        tinystm::tmtype<Node*>   right {nullptr};
        tinystm::tmtype<int64_t> color;    // color of parent link
        tinystm::tmtype<int64_t> size;     // subtree count
        Node(const K& key, const V& val, int64_t color, int64_t size) : key{key}, val{val}, color{color}, size{size} {}
    };

    tinystm::tmtype<Node*> root {nullptr};   // root of the BST

    inline void assignAndFreeIfNull(tinystm::tmtype<Node*>& z, Node* w) {
        Node* tofree = z;
        z = w;
        if (w == nullptr) tinystm::tmDelete(tofree);
    }

public:
    /**
     * Initializes an empty symbol table.
     */
    TinySTMRedBlackTree(int maxThreads=0){ }

    ~TinySTMRedBlackTree() {
        // The transaction log is not large enough to delete everything in one go,
        // so we delete one key per transaction, for up to 1000 transactions
        for (int i = 0; i < 1000; i++) {
            tinystm::updateTx([&] () {
                if (root == nullptr) return true;
                deleteMin();
                return true;
            });
        }
    }

    /***************************************************************************
     *  Node helper methods.
     ***************************************************************************/
    // is node x red; false if x is null ?
    bool isRed(Node* x) {
        if (x == nullptr) return false;
        return x->color == COLOR_RED;
    }

    // number of nodes in the subtree rooted at x; 0 if x is null
    int size(Node* x) {
        if (x == nullptr) return 0;
        return x->size;
    }

    /**
     * Returns the number of key-value pairs in this symbol table.
     * @return the number of key-value pairs in this symbol table
     */
    int size() { return size(root); }

    /**
     * Is this symbol table empty?
     * @return {@code true} if this symbol table is empty and {@code false} otherwise
     */
    bool isEmpty() { return root == nullptr; }

    /***************************************************************************
     *  Standard BST search.
     ***************************************************************************/

    /**
     * Returns the value associated with the given key.
     * @param key the key
     * @return the value associated with the given key if the key is in the symbol table
     *     and {@code null} if the key is not in the symbol table
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    bool innerGet(K& key, V& oldValue, const bool saveOldValue) {
        bool found = get(root, key);
        if (!found) return false;
        //if (saveOldValue) oldValue = *val;  // Copy of V
        return true;
    }

    // value associated with the given key in subtree rooted at x; null if no such key
    bool get(Node* x, const K& key) {
        while (x != nullptr) {
            if (key < x->key) x = x->left;
            else if (x->key < key) x = x->right;
            else return true;
        }
        return false;
    }

    /**
     * Does this symbol table contain the given key?
     * @param key the key
     * @return {@code true} if this symbol table contains {@code key} and
     *     {@code false} otherwise
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    bool containsKey(const K& key) {
        return get(root, key);
    }

    /***************************************************************************
     *  Red-black tree insertion.
     ***************************************************************************/

    /**
     * Inserts the specified key-value pair into the symbol table, overwriting the old
     * value with the new value if the symbol table already contains the specified key.
     * Deletes the specified key (and its associated value) from this symbol table
     * if the specified value is {@code null}.
     *
     * @param key the key
     * @param val the value
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    bool innerPut(const K& key, const V& value) {
        bool ret = false;
        root = put(root, key, value, ret);
        root->color = COLOR_BLACK;
        return ret;
    }

    // insert the key-value pair in the subtree rooted at h
    Node* put(Node* h, const K& key, const V& val, bool& ret) {
        if (h == nullptr) {
            ret = true;
            return tinystm::tmNew<Node>(key, val, COLOR_RED, 1);
        }
        if (key < h->key) h->left = put(h->left, key, val, ret);
        else if (h->key < key) h->right = put(h->right, key, val, ret);
        else h->val = val;
        // fix-up any right-leaning links
        if (isRed(h->right) && !isRed(h->left)) h = rotateLeft(h);
        if (isRed(h->left) && isRed(h->left->left)) h = rotateRight(h);
        if (isRed(h->left) && isRed(h->right)) flipColors(h);
        h->size = size(h->left) + size(h->right) + 1;
        return h;
    }

    /***************************************************************************
     *  Red-black tree deletion.
     ***************************************************************************/

    /**
     * Removes the smallest key and associated value from the symbol table.
     * @throws NoSuchElementException if the symbol table is empty
     */
    void deleteMin() {
        if (isEmpty()) return;
        // if both children of root are black, set root to red
        if (!isRed(root->left) && !isRed(root->right)) root->color = COLOR_RED;
        assignAndFreeIfNull(root, deleteMin(root));
        if (!isEmpty()) root->color = COLOR_BLACK;
        // assert check();
    }

    // delete the key-value pair with the minimum key rooted at h
    Node* deleteMin(Node* h) {
        if (h->left == nullptr) return nullptr;
        if (!isRed(h->left) && !isRed(h->left->left)) h = moveRedLeft(h);
        assignAndFreeIfNull(h->left, deleteMin(h->left));
        return balance(h);
    }

    /**
     * Removes the largest key and associated value from the symbol table.
    /**
     * Removes the largest key and associated value from the symbol table.
     * @throws NoSuchElementException if the symbol table is empty
     */
    void deleteMax() {
        if (isEmpty()) return;
        // if both children of root are black, set root to red
        if (!isRed(root->left) && !isRed(root->right)) root->color = COLOR_RED;
        assignAndFreeIfNull(root, deleteMax(root));
        if (!isEmpty()) root->color = COLOR_BLACK;
        // assert check();
    }

    // delete the key-value pair with the maximum key rooted at h
    Node* deleteMax(Node* h) {
        if (isRed(h->left)) h = rotateRight(h);
        if (h->right == nullptr) return nullptr;
        if (!isRed(h->right) && !isRed(h->right->left)) h = moveRedRight(h);
        assignAndFreeIfNull(h->right, deleteMax(h->right));
        return balance(h);
    }

    /**
     * Removes the specified key and its associated value from this symbol table
     * (if the key is in this symbol table).
     *
     * @param key the key
     */
    void innerRemove(const K& key) {
        // if both children of root are black, set root to red
        if (!isRed(root->left) && !isRed(root->right)) root->color = COLOR_RED;
        assignAndFreeIfNull(root, deleteKey(root, key));
        if (!isEmpty()) root->color = COLOR_BLACK;
        // assert check();
    }

    // delete the key-value pair with the given key rooted at h
    Node* deleteKey(Node* h, const K& key) {
        // assert get(h, key) != null;
        if (key < h->key) {
            if (!isRed(h->left) && !isRed(h->left->left)) {
                h = moveRedLeft(h);
            }
            assignAndFreeIfNull(h->left, deleteKey(h->left, key));
        } else {
            if (isRed(h->left)) {
                h = rotateRight(h);
            }
            if (key == h->key && (h->right == nullptr)) {
                return nullptr;
            }
            if (!isRed(h->right) && !isRed(h->right->left)) {
                h = moveRedRight(h);
            }
            if (key == h->key) {
                Node* x = min(h->right);
                h->key = x->key;
                h->val = x->val;
                // h->val = get(h->right, min(h->right).key);
                // h->key = min(h->right).key;
                assignAndFreeIfNull(h->right, deleteMin(h->right));
            } else {
                assignAndFreeIfNull(h->right, deleteKey(h->right, key));
            }
        }
        return balance(h);
    }

    /***************************************************************************
     *  Red-black tree helper functions.
     ***************************************************************************/
    // make a left-leaning link lean to the right
    Node* rotateRight(Node* h) {
        // assert (h != null) && isRed(h->left);
        Node* x = h->left;
        h->left = x->right;
        x->right = h;
        x->color = x->right->color;
        x->right->color = COLOR_RED;
        x->size = h->size;
        h->size = size(h->left) + size(h->right) + 1;
        return x;
    }

    // make a right-leaning link lean to the left
    Node* rotateLeft(Node* h) {
        // assert (h != null) && isRed(h->right);
        Node* x = h->right;
        h->right = x->left;
        x->left = h;
        x->color = x->left->color;
        x->left->color = COLOR_RED;
        x->size = h->size;
        h->size = size(h->left) + size(h->right) + 1;
        return x;
    }

    // flip the colors of a node and its two children
    void flipColors(Node* h) {
        // h must have opposite color of its two children
        // assert (h != null) && (h->left != null) && (h->right != null);
        // assert (!isRed(h) && isRed(h->left) && isRed(h->right))
        //    || (isRed(h) && !isRed(h->left) && !isRed(h->right));
        h->color = !h->color;
        h->left->color = !h->left->color;
        h->right->color = !h->right->color;
    }

    // Assuming that h is red and both h->left and h->left->left
    // are black, make h->left or one of its children red.
    Node* moveRedLeft(Node* h) {
        // assert (h != null);
        // assert isRed(h) && !isRed(h->left) && !isRed(h->left->left);
        flipColors(h);
        if (isRed(h->right->left)) {
            h->right = rotateRight(h->right);
            h = rotateLeft(h);
            flipColors(h);
        }
        return h;
    }

    // Assuming that h is red and both h->right and h->right->left
    // are black, make h->right or one of its children red.
    Node* moveRedRight(Node* h) {
        // assert (h != null);
        // assert isRed(h) && !isRed(h->right) && !isRed(h->right->left);
        flipColors(h);
        if (isRed(h->left->left)) {
            h = rotateRight(h);
            flipColors(h);
        }
        return h;
    }

    // restore red-black tree invariant
    Node* balance(Node* h) {
        // assert (h != null);
        if (isRed(h->right)) h = rotateLeft(h);
        if (isRed(h->left) && isRed(h->left->left)) h = rotateRight(h);
        if (isRed(h->left) && isRed(h->right)) flipColors(h);
        h->size = size(h->left) + size(h->right) + 1;
        return h;
    }

    /***************************************************************************
     *  Utility functions.
     ***************************************************************************/
    /**
     * Returns the height of the BST (for debugging).
     * @return the height of the BST (a 1-node tree has height 0)
     */
    int height() { return height(root); }

    int height(Node* x) {
        if (x == nullptr) return -1;
        return 1 + std::max(height(x->left), height(x->right));
    }

    /***************************************************************************
     *  Ordered symbol table methods.
     ***************************************************************************/
    /**
     * Returns the smallest key in the symbol table.
     * @return the smallest key in the symbol table
     * @throws NoSuchElementException if the symbol table is empty
     */
    K* min() {
        if (isEmpty()) return nullptr;
        return min(root)->key;
    }

    // the smallest key in subtree rooted at x; null if no such key
    Node* min(Node* x) {
        // assert x != null;
        if (x->left == nullptr) return x;
        else return min(x->left);
    }

    /**
     * Returns the largest key in the symbol table.
     * @return the largest key in the symbol table
     * @throws NoSuchElementException if the symbol table is empty
     */
    K* max() {
        if (isEmpty()) return nullptr;
        return max(root)->key;
    }

    // the largest key in the subtree rooted at x; null if no such key
    Node* max(Node* x) {
        // assert x != null;
        if (x->right == nullptr) return x;
        else return max(x->right);
    }

    /**
     * Returns the largest key in the symbol table less than or equal to {@code key}.
     * @param key the key
     * @return the largest key in the symbol table less than or equal to {@code key}
     * @throws NoSuchElementException if there is no such key
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    K* floor(const K& key) {
        if (isEmpty()) return nullptr;
        Node* x = floor(root, key);
        if (x == nullptr) return nullptr;
        else return x->key;
    }

    // the largest key in the subtree rooted at x less than or equal to the given key
    Node* floor(Node* x, const K& key) {
        if (x == nullptr) return nullptr;
        if (key == x->key) return x;
        if (key < x->key) return floor(x->left, key);
        Node* t = floor(x->right, key);
        if (t != nullptr) return t;
        else return x;
    }
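    // [Editor's note] Worked example of the two bracketing queries on keys {10, 20, 30}:
    //   floor(25)   -> 20       (largest key <= 25)
    //   floor(5)    -> nullptr  (no key <= 5)
    //   ceiling(25) -> 30       (smallest key >= 25)
    //   ceiling(35) -> nullptr  (no key >= 35)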
    /**
     * Returns the smallest key in the symbol table greater than or equal to {@code key}.
     * @param key the key
     * @return the smallest key in the symbol table greater than or equal to {@code key}
     * @throws NoSuchElementException if there is no such key
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    K* ceiling(const K& key) {
        if (isEmpty()) return nullptr;
        Node* x = ceiling(root, key);
        if (x == nullptr) return nullptr;
        else return x->key;
    }

    // the smallest key in the subtree rooted at x greater than or equal to the given key
    Node* ceiling(Node* x, const K& key) {
        if (x == nullptr) return nullptr;
        if (key == x->key) return x;
        if (x->key < key) return ceiling(x->right, key);
        Node* t = ceiling(x->left, key);
        if (t != nullptr) return t;
        else return x;
    }

    /**
     * Return the kth smallest key in the symbol table.
     * @param k the order statistic
     * @return the {@code k}th smallest key in the symbol table
     * @throws IllegalArgumentException unless {@code k} is between 0 and n-1
     */
    K* select(int k) {
        if (k < 0 || k >= size()) {
            return nullptr;
        }
        Node* x = select(root, k);
        return x->key;
    }

    // the key of rank k in the subtree rooted at x
    Node* select(Node* x, int k) {
        // assert x != null;
        // assert k >= 0 && k < size(x);
        int t = size(x->left);
        if (t > k) return select(x->left, k);
        else if (t < k) return select(x->right, k-t-1);
        else return x;
    }

    /**
     * Return the number of keys in the symbol table strictly less than {@code key}.
     * @param key the key
     * @return the number of keys in the symbol table strictly less than {@code key}
     * @throws IllegalArgumentException if {@code key} is {@code null}
     */
    int rank(const K& key) { return rank(key, root); }

    // number of keys less than key in the subtree rooted at x
    int rank(const K& key, Node* x) {
        if (x == nullptr) return 0;
        if (key < x->key) return rank(key, x->left);
        else if (x->key < key) return 1 + size(x->left) + rank(key, x->right);
        else return size(x->left);
    }

    /***************************************************************************
     *  Range count and range search.
     ***************************************************************************/
    /**
     * Returns the number of keys in the symbol table in the given range.
     *
     * @param lo minimum endpoint
     * @param hi maximum endpoint
     * @return the number of keys in the symbol table between {@code lo}
     *     (inclusive) and {@code hi} (inclusive)
     * @throws IllegalArgumentException if either {@code lo} or {@code hi}
     *     is {@code null}
     */
    int size(const K& lo, const K& hi) {
        if (hi < lo) return 0;
        if (containsKey(hi)) return rank(hi) - rank(lo) + 1;
        else return rank(hi) - rank(lo);
    }

    /***************************************************************************
     *  Check integrity of red-black tree data structure.
     ***************************************************************************/
    bool check() {
        if (!isBST()) std::cout << "Not in symmetric order\n";
        if (!isSizeConsistent()) std::cout << "Subtree counts not consistent\n";
        //if (!isRankConsistent()) std::cout << "Ranks not consistent\n";
        if (!is23()) std::cout << "Not a 2-3 tree\n";
        if (!isBalanced()) std::cout << "Not balanced\n";
        return isBST() && isSizeConsistent() && is23() && isBalanced();
    }
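    // [Editor's note] check() walks raw pointers with no synchronization, so on a shared
    // tree it should run inside a transaction. A minimal sketch, assuming readTx() forwards
    // the lambda's boolean result exactly as in contains() below (the name checkTx is
    // hypothetical, not part of the original file). Note that a retried transaction may
    // print its diagnostics more than once.
    bool checkTx() {
        return tinystm::readTx([&] () { return check(); });
    }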
    // does this binary tree satisfy symmetric order?
    // Note: this test also ensures that the data structure is a binary tree, since the order is strict
    bool isBST() { return isBST(root, nullptr, nullptr); }

    // is the tree rooted at x a BST with all keys strictly between min and max
    // (if min or max is null, treat as empty constraint)
    // Credit: Bob Dondero's elegant solution
    bool isBST(Node* x, K* min, K* max) {
        if (x == nullptr) return true;
        // TODO: port these two lines
        //if (min != nullptr && x->key.compareTo(min) <= 0) return false;
        //if (max != nullptr && x->key.compareTo(max) >= 0) return false;
        return isBST(x->left, min, x->key) && isBST(x->right, x->key, max);
    }

    // are the size fields correct?
    bool isSizeConsistent() { return isSizeConsistent(root); }

    bool isSizeConsistent(Node* x) {
        if (x == nullptr) return true;
        if (x->size != size(x->left) + size(x->right) + 1) return false;
        return isSizeConsistent(x->left) && isSizeConsistent(x->right);
    }

    /*
    // check that ranks are consistent
    bool isRankConsistent() {
        for (int i = 0; i < size(); i++)
            if (i != rank(select(i))) return false;
        for (K* key : keys())
            if (key.compareTo(select(rank(key))) != 0) return false;
        return true;
    }
    */

    // Does the tree have no red right links, and at most one (left)
    // red link in a row on any path?
    bool is23() { return is23(root); }

    bool is23(Node* x) {
        if (x == nullptr) return true;
        if (isRed(x->right)) return false;
        if (x != root && isRed(x) && isRed(x->left)) return false;
        return is23(x->left) && is23(x->right);
    }

    // do all paths from root to leaf have same number of black edges?
    bool isBalanced() {
        int black = 0;     // number of black links on path from root to min
        Node* x = root;
        while (x != nullptr) {
            if (!isRed(x)) black++;
            x = x->left;
        }
        return isBalanced(root, black);
    }

    // does every path from the root to a leaf have the given number of black links?
    bool isBalanced(Node* x, int black) {
        if (x == nullptr) return black == 0;
        if (!isRed(x)) black--;
        return isBalanced(x->left, black) && isBalanced(x->right, black);
    }

    // Inserts a key only if it's not already present
    bool add(K key, const int tid=0) {
        return tinystm::updateTx([&] () { return innerPut(key,key); });
    }

    // Returns true only if the key was present
    bool remove(K key, const int tid=0) {
        return tinystm::updateTx([&] () {
            V notused;
            bool retval = innerGet(key,notused,false);
            if (retval) innerRemove(key);
            return retval;
        });
    }

    bool contains(K key, const int tid=0) {
        return tinystm::readTx([&] () {
            V notused;
            return innerGet(key,notused,false);
        });
    }
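    // [Editor's note] updateTx() lets the inner operations above compose into larger atomic
    // steps. A minimal sketch (the name rename is hypothetical, not part of the original
    // file): atomically move oldKey to newKey, returning false if oldKey is absent.
    bool rename(K oldKey, K newKey) {
        return tinystm::updateTx([&] () {
            V notused;
            if (!innerGet(oldKey, notused, false)) return false;  // nothing to move
            innerRemove(oldKey);
            innerPut(newKey, newKey);  // same key-as-value convention as add() above
            return true;
        });
    }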
    // This is not fully transactional, but that's OK because we only use it during initialization.
    // We could make it fully transactional, but we would have to increase the size of the allocation/store logs.
    void addAll(K** keys, int size, const int tid=0) {
        for (int i = 0; i < size; i++) add(*keys[i], tid);
    }

    static std::string className() { return tinystm::TinySTM::className() + "-RedBlackTree"; }
};

#endif // _TINY_STM_RED_BLACK_BST_H_


================================================
FILE: datastructures/trevor_brown_abtree/Makefile
================================================
GPP = g++
FLAGS = -std=c++11 -mcx16 -O3 -g
FLAGS += -DPHYSICAL_PROCESSORS=48 -DMAX_TID_POW2=64
LDFLAGS += -I./common
LDFLAGS += -I./common/dcss
LDFLAGS += -I./common/atomic_ops
LDFLAGS += -I./common/descriptors
LDFLAGS += -I./common/recordmgr
LDFLAGS += -I./common/rq
LDFLAGS += -I./common/rq/snapcollector
LDFLAGS += -I./ds/brown_ext_abtree_lf
LDFLAGS += -lpthread

all: minimal_example

minimal_example:
	$(GPP) $(FLAGS) -o $@.out $@.cpp $(LDFLAGS)


================================================
FILE: datastructures/trevor_brown_abtree/TrevorBrownABTree.hpp
================================================
#ifndef _TREVOR_BROWN_AB_TREE_HP_H_
#define _TREVOR_BROWN_AB_TREE_HP_H_

#include <string>
#include
#include
#include "common/ThreadRegistry.hpp"
#include "ds/brown_ext_abtree_lf/brown_ext_abtree_lf_adapter.h"

/*
 * This is a wrapper around Trevor Brown's AB-Tree so that we can use it in our benchmarks
 * TODO: We've enabled Hazard Pointers as memory reclamation
 */
template<typename K>
class TrevorBrownABTree {
    static const int NODE_DEGREE = 16;
    const int ANY_KEY = 0;
    const int NUM_THREADS = 128;
    //ds_adapter>* tree;
    ds_adapter<K, void*>* tree;

public:
    TrevorBrownABTree(int numThreads) {
        //tree = new ds_adapter>(NUM_THREADS, ANY_KEY);
        tree = new ds_adapter<K, void*>(NUM_THREADS, ANY_KEY);
    }

    ~TrevorBrownABTree() { delete tree; }

    // Inserts a key only if it's not already present
    bool add(K key, const int tid=0) {
        int threadID = tl_tcico.tid;
        if (threadID == ThreadCheckInCheckOut::NOT_ASSIGNED) {
            threadID = ThreadRegistry::getTID();
            tree->initThread(threadID);
        }
        return tree->insert(threadID, key, (void *) 1) != tree->getNoValue();
    }

    // Returns true only if the key was present
    bool remove(K key, const int tid=0) {
        int threadID = tl_tcico.tid;
        if (threadID == ThreadCheckInCheckOut::NOT_ASSIGNED) {
            threadID = ThreadRegistry::getTID();
            tree->initThread(threadID);
        }
        return tree->erase(threadID, key) != tree->getNoValue();
    }

    bool contains(K key, const int tid=0) {
        int threadID = tl_tcico.tid;
        if (threadID == ThreadCheckInCheckOut::NOT_ASSIGNED) {
            threadID = ThreadRegistry::getTID();
            tree->initThread(threadID);
        }
        return tree->contains(threadID, key);
    }
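    // [Editor's note] add(), remove() and contains() above all repeat the same lazy
    // thread-registration step. A minimal sketch of a factored-out helper, using only
    // names that already appear in this file (the helper name registeredTid is
    // hypothetical, not part of the original code):
    int registeredTid() {
        int threadID = tl_tcico.tid;
        if (threadID == ThreadCheckInCheckOut::NOT_ASSIGNED) {
            threadID = ThreadRegistry::getTID();   // assign a tid to this thread once
            tree->initThread(threadID);            // per-thread init required by the AB-tree
        }
        return threadID;
    }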
    // This is not fully transactional, but that's OK because we only use it during initialization.
    // We could make it fully transactional, but we would have to increase the size of the allocation/store logs.
    void addAll(K** keys, int size, const int tid=0) {
        for (int i = 0; i < size; i++) add(*keys[i], tid);
    }

    static std::string className() { return "TrevorBrown-AB-Tree"; }
};

#endif // _TREVOR_BROWN_AB_TREE_HP_H_


================================================
FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/generalize-small.h
================================================
/* char_load */ #if defined(AO_HAVE_char_load_acquire) && !defined(AO_HAVE_char_load) # define AO_char_load(addr) AO_char_load_acquire(addr) # define AO_HAVE_char_load #endif #if defined(AO_HAVE_char_load_full) && !defined(AO_HAVE_char_load_acquire) # define AO_char_load_acquire(addr) AO_char_load_full(addr) # define AO_HAVE_char_load_acquire #endif #if defined(AO_HAVE_char_load_full) && !defined(AO_HAVE_char_load_read) # define AO_char_load_read(addr) AO_char_load_full(addr) # define AO_HAVE_char_load_read #endif #if !defined(AO_HAVE_char_load_acquire_read) && defined(AO_HAVE_char_load_acquire) # define AO_char_load_acquire_read(addr) AO_char_load_acquire(addr) # define AO_HAVE_char_load_acquire_read #endif #if defined(AO_HAVE_char_load) && defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_char_load_acquire) AO_INLINE unsigned char AO_char_load_acquire(const volatile unsigned char *addr) { unsigned char result = AO_char_load(addr); /* Acquire barrier would be useless, since the load could be delayed */ /* beyond it. */ AO_nop_full(); return result; } # define AO_HAVE_char_load_acquire #endif #if defined(AO_HAVE_char_load) && defined(AO_HAVE_nop_read) && \ !defined(AO_HAVE_char_load_read) AO_INLINE unsigned char AO_char_load_read(const volatile unsigned char *addr) { unsigned char result = AO_char_load(addr); /* Acquire barrier would be useless, since the load could be delayed */ /* beyond it.
*/ AO_nop_read(); return result; } # define AO_HAVE_char_load_read #endif #if defined(AO_HAVE_char_load_acquire) && defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_char_load_full) # define AO_char_load_full(addr) (AO_nop_full(), AO_char_load_acquire(addr)) # define AO_HAVE_char_load_full #endif #if !defined(AO_HAVE_char_load_acquire_read) && defined(AO_HAVE_char_load_read) # define AO_char_load_acquire_read(addr) AO_char_load_read(addr) # define AO_HAVE_char_load_acquire_read #endif #if defined(AO_HAVE_char_load_acquire_read) && !defined(AO_HAVE_char_load) # define AO_char_load(addr) AO_char_load_acquire_read(addr) # define AO_HAVE_char_load #endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_char_load_acquire_read) # define AO_char_load_dd_acquire_read(addr) \ AO_char_load_acquire_read(addr) # define AO_HAVE_char_load_dd_acquire_read # endif #else # if defined(AO_HAVE_char_load) # define AO_char_load_dd_acquire_read(addr) \ AO_char_load(addr) # define AO_HAVE_char_load_dd_acquire_read # endif #endif /* char_store */ #if defined(AO_HAVE_char_store_release) && !defined(AO_HAVE_char_store) # define AO_char_store(addr, val) AO_char_store_release(addr,val) # define AO_HAVE_char_store #endif #if defined(AO_HAVE_char_store_full) && !defined(AO_HAVE_char_store_release) # define AO_char_store_release(addr,val) AO_char_store_full(addr,val) # define AO_HAVE_char_store_release #endif #if defined(AO_HAVE_char_store_full) && !defined(AO_HAVE_char_store_write) # define AO_char_store_write(addr,val) AO_char_store_full(addr,val) # define AO_HAVE_char_store_write #endif #if defined(AO_HAVE_char_store_release) && \ !defined(AO_HAVE_char_store_release_write) # define AO_char_store_release_write(addr, val) \ AO_char_store_release(addr,val) # define AO_HAVE_char_store_release_write #endif #if defined(AO_HAVE_char_store_write) && !defined(AO_HAVE_char_store) # define AO_char_store(addr, val) AO_char_store_write(addr,val) # define AO_HAVE_char_store #endif #if defined(AO_HAVE_char_store) && defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_char_store_release) # define AO_char_store_release(addr,val) \ (AO_nop_full(), AO_char_store(addr,val)) # define AO_HAVE_char_store_release #endif #if defined(AO_HAVE_nop_write) && defined(AO_HAVE_char_store) && \ !defined(AO_HAVE_char_store_write) # define AO_char_store_write(addr, val) \ (AO_nop_write(), AO_char_store(addr,val)) # define AO_HAVE_char_store_write #endif #if defined(AO_HAVE_char_store_write) && \ !defined(AO_HAVE_char_store_release_write) # define AO_char_store_release_write(addr, val) AO_char_store_write(addr,val) # define AO_HAVE_char_store_release_write #endif #if defined(AO_HAVE_char_store_release) && defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_char_store_full) # define AO_char_store_full(addr, val) \ (AO_char_store_release(addr, val), AO_nop_full()) # define AO_HAVE_char_store_full #endif /* char_fetch_and_add */ #if defined(AO_HAVE_char_compare_and_swap_full) && \ !defined(AO_HAVE_char_fetch_and_add_full) AO_INLINE AO_t AO_char_fetch_and_add_full(volatile unsigned char *addr, unsigned char incr) { unsigned char old; do { old = *addr; } while (!AO_char_compare_and_swap_full(addr, old, old+incr)); return old; } # define AO_HAVE_char_fetch_and_add_full #endif #if defined(AO_HAVE_char_compare_and_swap_acquire) && \ !defined(AO_HAVE_char_fetch_and_add_acquire) AO_INLINE AO_t AO_char_fetch_and_add_acquire(volatile unsigned char *addr, unsigned char incr) { unsigned char old; do { old = *addr; } while (!AO_char_compare_and_swap_acquire(addr, old, old+incr)); 
return old; } # define AO_HAVE_char_fetch_and_add_acquire #endif #if defined(AO_HAVE_char_compare_and_swap_release) && \ !defined(AO_HAVE_char_fetch_and_add_release) AO_INLINE AO_t AO_char_fetch_and_add_release(volatile unsigned char *addr, unsigned char incr) { unsigned char old; do { old = *addr; } while (!AO_char_compare_and_swap_release(addr, old, old+incr)); return old; } # define AO_HAVE_char_fetch_and_add_release #endif #if defined(AO_HAVE_char_fetch_and_add_full) # if !defined(AO_HAVE_char_fetch_and_add_release) # define AO_char_fetch_and_add_release(addr, val) \ AO_char_fetch_and_add_full(addr, val) # define AO_HAVE_char_fetch_and_add_release # endif # if !defined(AO_HAVE_char_fetch_and_add_acquire) # define AO_char_fetch_and_add_acquire(addr, val) \ AO_char_fetch_and_add_full(addr, val) # define AO_HAVE_char_fetch_and_add_acquire # endif # if !defined(AO_HAVE_char_fetch_and_add_write) # define AO_char_fetch_and_add_write(addr, val) \ AO_char_fetch_and_add_full(addr, val) # define AO_HAVE_char_fetch_and_add_write # endif # if !defined(AO_HAVE_char_fetch_and_add_read) # define AO_char_fetch_and_add_read(addr, val) \ AO_char_fetch_and_add_full(addr, val) # define AO_HAVE_char_fetch_and_add_read # endif #endif /* AO_HAVE_char_fetch_and_add_full */ #if !defined(AO_HAVE_char_fetch_and_add) && \ defined(AO_HAVE_char_fetch_and_add_release) # define AO_char_fetch_and_add(addr, val) \ AO_char_fetch_and_add_release(addr, val) # define AO_HAVE_char_fetch_and_add #endif #if !defined(AO_HAVE_char_fetch_and_add) && \ defined(AO_HAVE_char_fetch_and_add_acquire) # define AO_char_fetch_and_add(addr, val) \ AO_char_fetch_and_add_acquire(addr, val) # define AO_HAVE_char_fetch_and_add #endif #if !defined(AO_HAVE_char_fetch_and_add) && \ defined(AO_HAVE_char_fetch_and_add_write) # define AO_char_fetch_and_add(addr, val) \ AO_char_fetch_and_add_write(addr, val) # define AO_HAVE_char_fetch_and_add #endif #if !defined(AO_HAVE_char_fetch_and_add) && \ defined(AO_HAVE_char_fetch_and_add_read) # define AO_char_fetch_and_add(addr, val) \ AO_char_fetch_and_add_read(addr, val) # define AO_HAVE_char_fetch_and_add #endif #if defined(AO_HAVE_char_fetch_and_add_acquire) &&\ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_char_fetch_and_add_full) # define AO_char_fetch_and_add_full(addr, val) \ (AO_nop_full(), AO_char_fetch_and_add_acquire(addr, val)) #endif #if !defined(AO_HAVE_char_fetch_and_add_release_write) && \ defined(AO_HAVE_char_fetch_and_add_write) # define AO_char_fetch_and_add_release_write(addr, val) \ AO_char_fetch_and_add_write(addr, val) # define AO_HAVE_char_fetch_and_add_release_write #endif #if !defined(AO_HAVE_char_fetch_and_add_release_write) && \ defined(AO_HAVE_char_fetch_and_add_release) # define AO_char_fetch_and_add_release_write(addr, val) \ AO_char_fetch_and_add_release(addr, val) # define AO_HAVE_char_fetch_and_add_release_write #endif #if !defined(AO_HAVE_char_fetch_and_add_acquire_read) && \ defined(AO_HAVE_char_fetch_and_add_read) # define AO_char_fetch_and_add_acquire_read(addr, val) \ AO_char_fetch_and_add_read(addr, val) # define AO_HAVE_char_fetch_and_add_acquire_read #endif #if !defined(AO_HAVE_char_fetch_and_add_acquire_read) && \ defined(AO_HAVE_char_fetch_and_add_acquire) # define AO_char_fetch_and_add_acquire_read(addr, val) \ AO_char_fetch_and_add_acquire(addr, val) # define AO_HAVE_char_fetch_and_add_acquire_read #endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_char_fetch_and_add_acquire_read) # define AO_char_fetch_and_add_dd_acquire_read(addr, val) \ 
AO_char_fetch_and_add_acquire_read(addr, val) # define AO_HAVE_char_fetch_and_add_dd_acquire_read # endif #else # if defined(AO_HAVE_char_fetch_and_add) # define AO_char_fetch_and_add_dd_acquire_read(addr, val) \ AO_char_fetch_and_add(addr, val) # define AO_HAVE_char_fetch_and_add_dd_acquire_read # endif #endif /* char_fetch_and_add1 */ #if defined(AO_HAVE_char_fetch_and_add_full) &&\ !defined(AO_HAVE_char_fetch_and_add1_full) # define AO_char_fetch_and_add1_full(addr) \ AO_char_fetch_and_add_full(addr,1) # define AO_HAVE_char_fetch_and_add1_full #endif #if defined(AO_HAVE_char_fetch_and_add_release) &&\ !defined(AO_HAVE_char_fetch_and_add1_release) # define AO_char_fetch_and_add1_release(addr) \ AO_char_fetch_and_add_release(addr,1) # define AO_HAVE_char_fetch_and_add1_release #endif #if defined(AO_HAVE_char_fetch_and_add_acquire) &&\ !defined(AO_HAVE_char_fetch_and_add1_acquire) # define AO_char_fetch_and_add1_acquire(addr) \ AO_char_fetch_and_add_acquire(addr,1) # define AO_HAVE_char_fetch_and_add1_acquire #endif #if defined(AO_HAVE_char_fetch_and_add_write) &&\ !defined(AO_HAVE_char_fetch_and_add1_write) # define AO_char_fetch_and_add1_write(addr) \ AO_char_fetch_and_add_write(addr,1) # define AO_HAVE_char_fetch_and_add1_write #endif #if defined(AO_HAVE_char_fetch_and_add_read) &&\ !defined(AO_HAVE_char_fetch_and_add1_read) # define AO_char_fetch_and_add1_read(addr) \ AO_char_fetch_and_add_read(addr,1) # define AO_HAVE_char_fetch_and_add1_read #endif #if defined(AO_HAVE_char_fetch_and_add_release_write) &&\ !defined(AO_HAVE_char_fetch_and_add1_release_write) # define AO_char_fetch_and_add1_release_write(addr) \ AO_char_fetch_and_add_release_write(addr,1) # define AO_HAVE_char_fetch_and_add1_release_write #endif #if defined(AO_HAVE_char_fetch_and_add_acquire_read) &&\ !defined(AO_HAVE_char_fetch_and_add1_acquire_read) # define AO_char_fetch_and_add1_acquire_read(addr) \ AO_char_fetch_and_add_acquire_read(addr,1) # define AO_HAVE_char_fetch_and_add1_acquire_read #endif #if defined(AO_HAVE_char_fetch_and_add) &&\ !defined(AO_HAVE_char_fetch_and_add1) # define AO_char_fetch_and_add1(addr) \ AO_char_fetch_and_add(addr,1) # define AO_HAVE_char_fetch_and_add1 #endif #if defined(AO_HAVE_char_fetch_and_add1_full) # if !defined(AO_HAVE_char_fetch_and_add1_release) # define AO_char_fetch_and_add1_release(addr) \ AO_char_fetch_and_add1_full(addr) # define AO_HAVE_char_fetch_and_add1_release # endif # if !defined(AO_HAVE_char_fetch_and_add1_acquire) # define AO_char_fetch_and_add1_acquire(addr) \ AO_char_fetch_and_add1_full(addr) # define AO_HAVE_char_fetch_and_add1_acquire # endif # if !defined(AO_HAVE_char_fetch_and_add1_write) # define AO_char_fetch_and_add1_write(addr) \ AO_char_fetch_and_add1_full(addr) # define AO_HAVE_char_fetch_and_add1_write # endif # if !defined(AO_HAVE_char_fetch_and_add1_read) # define AO_char_fetch_and_add1_read(addr) \ AO_char_fetch_and_add1_full(addr) # define AO_HAVE_char_fetch_and_add1_read # endif #endif /* AO_HAVE_char_fetch_and_add1_full */ #if !defined(AO_HAVE_char_fetch_and_add1) && \ defined(AO_HAVE_char_fetch_and_add1_release) # define AO_char_fetch_and_add1(addr) \ AO_char_fetch_and_add1_release(addr) # define AO_HAVE_char_fetch_and_add1 #endif #if !defined(AO_HAVE_char_fetch_and_add1) && \ defined(AO_HAVE_char_fetch_and_add1_acquire) # define AO_char_fetch_and_add1(addr) \ AO_char_fetch_and_add1_acquire(addr) # define AO_HAVE_char_fetch_and_add1 #endif #if !defined(AO_HAVE_char_fetch_and_add1) && \ defined(AO_HAVE_char_fetch_and_add1_write) # define 
AO_char_fetch_and_add1(addr) \ AO_char_fetch_and_add1_write(addr) # define AO_HAVE_char_fetch_and_add1 #endif #if !defined(AO_HAVE_char_fetch_and_add1) && \ defined(AO_HAVE_char_fetch_and_add1_read) # define AO_char_fetch_and_add1(addr) \ AO_char_fetch_and_add1_read(addr) # define AO_HAVE_char_fetch_and_add1 #endif #if defined(AO_HAVE_char_fetch_and_add1_acquire) &&\ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_char_fetch_and_add1_full) # define AO_char_fetch_and_add1_full(addr) \ (AO_nop_full(), AO_char_fetch_and_add1_acquire(addr)) # define AO_HAVE_char_fetch_and_add1_full #endif #if !defined(AO_HAVE_char_fetch_and_add1_release_write) && \ defined(AO_HAVE_char_fetch_and_add1_write) # define AO_char_fetch_and_add1_release_write(addr) \ AO_char_fetch_and_add1_write(addr) # define AO_HAVE_char_fetch_and_add1_release_write #endif #if !defined(AO_HAVE_char_fetch_and_add1_release_write) && \ defined(AO_HAVE_char_fetch_and_add1_release) # define AO_char_fetch_and_add1_release_write(addr) \ AO_char_fetch_and_add1_release(addr) # define AO_HAVE_char_fetch_and_add1_release_write #endif #if !defined(AO_HAVE_char_fetch_and_add1_acquire_read) && \ defined(AO_HAVE_char_fetch_and_add1_read) # define AO_char_fetch_and_add1_acquire_read(addr) \ AO_char_fetch_and_add1_read(addr) # define AO_HAVE_char_fetch_and_add1_acquire_read #endif #if !defined(AO_HAVE_char_fetch_and_add1_acquire_read) && \ defined(AO_HAVE_char_fetch_and_add1_acquire) # define AO_char_fetch_and_add1_acquire_read(addr) \ AO_char_fetch_and_add1_acquire(addr) # define AO_HAVE_char_fetch_and_add1_acquire_read #endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_char_fetch_and_add1_acquire_read) # define AO_char_fetch_and_add1_dd_acquire_read(addr) \ AO_char_fetch_and_add1_acquire_read(addr) # define AO_HAVE_char_fetch_and_add1_dd_acquire_read # endif #else # if defined(AO_HAVE_char_fetch_and_add1) # define AO_char_fetch_and_add1_dd_acquire_read(addr) \ AO_char_fetch_and_add1(addr) # define AO_HAVE_char_fetch_and_add1_dd_acquire_read # endif #endif /* char_fetch_and_sub1 */ #if defined(AO_HAVE_char_fetch_and_add_full) &&\ !defined(AO_HAVE_char_fetch_and_sub1_full) # define AO_char_fetch_and_sub1_full(addr) \ AO_char_fetch_and_add_full(addr,(unsigned char)(-1)) # define AO_HAVE_char_fetch_and_sub1_full #endif #if defined(AO_HAVE_char_fetch_and_add_release) &&\ !defined(AO_HAVE_char_fetch_and_sub1_release) # define AO_char_fetch_and_sub1_release(addr) \ AO_char_fetch_and_add_release(addr,(unsigned char)(-1)) # define AO_HAVE_char_fetch_and_sub1_release #endif #if defined(AO_HAVE_char_fetch_and_add_acquire) &&\ !defined(AO_HAVE_char_fetch_and_sub1_acquire) # define AO_char_fetch_and_sub1_acquire(addr) \ AO_char_fetch_and_add_acquire(addr,(unsigned char)(-1)) # define AO_HAVE_char_fetch_and_sub1_acquire #endif #if defined(AO_HAVE_char_fetch_and_add_write) &&\ !defined(AO_HAVE_char_fetch_and_sub1_write) # define AO_char_fetch_and_sub1_write(addr) \ AO_char_fetch_and_add_write(addr,(unsigned char)(-1)) # define AO_HAVE_char_fetch_and_sub1_write #endif #if defined(AO_HAVE_char_fetch_and_add_read) &&\ !defined(AO_HAVE_char_fetch_and_sub1_read) # define AO_char_fetch_and_sub1_read(addr) \ AO_char_fetch_and_add_read(addr,(unsigned char)(-1)) # define AO_HAVE_char_fetch_and_sub1_read #endif #if defined(AO_HAVE_char_fetch_and_add_release_write) &&\ !defined(AO_HAVE_char_fetch_and_sub1_release_write) # define AO_char_fetch_and_sub1_release_write(addr) \ AO_char_fetch_and_add_release_write(addr,(unsigned char)(-1)) # define 
AO_HAVE_char_fetch_and_sub1_release_write #endif #if defined(AO_HAVE_char_fetch_and_add_acquire_read) &&\ !defined(AO_HAVE_char_fetch_and_sub1_acquire_read) # define AO_char_fetch_and_sub1_acquire_read(addr) \ AO_char_fetch_and_add_acquire_read(addr,(unsigned char)(-1)) # define AO_HAVE_char_fetch_and_sub1_acquire_read #endif #if defined(AO_HAVE_char_fetch_and_add) &&\ !defined(AO_HAVE_char_fetch_and_sub1) # define AO_char_fetch_and_sub1(addr) \ AO_char_fetch_and_add(addr,(unsigned char)(-1)) # define AO_HAVE_char_fetch_and_sub1 #endif #if defined(AO_HAVE_char_fetch_and_sub1_full) # if !defined(AO_HAVE_char_fetch_and_sub1_release) # define AO_char_fetch_and_sub1_release(addr) \ AO_char_fetch_and_sub1_full(addr) # define AO_HAVE_char_fetch_and_sub1_release # endif # if !defined(AO_HAVE_char_fetch_and_sub1_acquire) # define AO_char_fetch_and_sub1_acquire(addr) \ AO_char_fetch_and_sub1_full(addr) # define AO_HAVE_char_fetch_and_sub1_acquire # endif # if !defined(AO_HAVE_char_fetch_and_sub1_write) # define AO_char_fetch_and_sub1_write(addr) \ AO_char_fetch_and_sub1_full(addr) # define AO_HAVE_char_fetch_and_sub1_write # endif # if !defined(AO_HAVE_char_fetch_and_sub1_read) # define AO_char_fetch_and_sub1_read(addr) \ AO_char_fetch_and_sub1_full(addr) # define AO_HAVE_char_fetch_and_sub1_read # endif #endif /* AO_HAVE_char_fetch_and_sub1_full */ #if !defined(AO_HAVE_char_fetch_and_sub1) && \ defined(AO_HAVE_char_fetch_and_sub1_release) # define AO_char_fetch_and_sub1(addr) \ AO_char_fetch_and_sub1_release(addr) # define AO_HAVE_char_fetch_and_sub1 #endif #if !defined(AO_HAVE_char_fetch_and_sub1) && \ defined(AO_HAVE_char_fetch_and_sub1_acquire) # define AO_char_fetch_and_sub1(addr) \ AO_char_fetch_and_sub1_acquire(addr) # define AO_HAVE_char_fetch_and_sub1 #endif #if !defined(AO_HAVE_char_fetch_and_sub1) && \ defined(AO_HAVE_char_fetch_and_sub1_write) # define AO_char_fetch_and_sub1(addr) \ AO_char_fetch_and_sub1_write(addr) # define AO_HAVE_char_fetch_and_sub1 #endif #if !defined(AO_HAVE_char_fetch_and_sub1) && \ defined(AO_HAVE_char_fetch_and_sub1_read) # define AO_char_fetch_and_sub1(addr) \ AO_char_fetch_and_sub1_read(addr) # define AO_HAVE_char_fetch_and_sub1 #endif #if defined(AO_HAVE_char_fetch_and_sub1_acquire) &&\ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_char_fetch_and_sub1_full) # define AO_char_fetch_and_sub1_full(addr) \ (AO_nop_full(), AO_char_fetch_and_sub1_acquire(addr)) # define AO_HAVE_char_fetch_and_sub1_full #endif #if !defined(AO_HAVE_char_fetch_and_sub1_release_write) && \ defined(AO_HAVE_char_fetch_and_sub1_write) # define AO_char_fetch_and_sub1_release_write(addr) \ AO_char_fetch_and_sub1_write(addr) # define AO_HAVE_char_fetch_and_sub1_release_write #endif #if !defined(AO_HAVE_char_fetch_and_sub1_release_write) && \ defined(AO_HAVE_char_fetch_and_sub1_release) # define AO_char_fetch_and_sub1_release_write(addr) \ AO_char_fetch_and_sub1_release(addr) # define AO_HAVE_char_fetch_and_sub1_release_write #endif #if !defined(AO_HAVE_char_fetch_and_sub1_acquire_read) && \ defined(AO_HAVE_char_fetch_and_sub1_read) # define AO_char_fetch_and_sub1_acquire_read(addr) \ AO_char_fetch_and_sub1_read(addr) # define AO_HAVE_char_fetch_and_sub1_acquire_read #endif #if !defined(AO_HAVE_char_fetch_and_sub1_acquire_read) && \ defined(AO_HAVE_char_fetch_and_sub1_acquire) # define AO_char_fetch_and_sub1_acquire_read(addr) \ AO_char_fetch_and_sub1_acquire(addr) # define AO_HAVE_char_fetch_and_sub1_acquire_read #endif #ifdef AO_NO_DD_ORDERING # if 
defined(AO_HAVE_char_fetch_and_sub1_acquire_read) # define AO_char_fetch_and_sub1_dd_acquire_read(addr) \ AO_char_fetch_and_sub1_acquire_read(addr) # define AO_HAVE_char_fetch_and_sub1_dd_acquire_read # endif #else # if defined(AO_HAVE_char_fetch_and_sub1) # define AO_char_fetch_and_sub1_dd_acquire_read(addr) \ AO_char_fetch_and_sub1(addr) # define AO_HAVE_char_fetch_and_sub1_dd_acquire_read # endif #endif /* short_load */ #if defined(AO_HAVE_short_load_acquire) && !defined(AO_HAVE_short_load) # define AO_short_load(addr) AO_short_load_acquire(addr) # define AO_HAVE_short_load #endif #if defined(AO_HAVE_short_load_full) && !defined(AO_HAVE_short_load_acquire) # define AO_short_load_acquire(addr) AO_short_load_full(addr) # define AO_HAVE_short_load_acquire #endif #if defined(AO_HAVE_short_load_full) && !defined(AO_HAVE_short_load_read) # define AO_short_load_read(addr) AO_short_load_full(addr) # define AO_HAVE_short_load_read #endif #if !defined(AO_HAVE_short_load_acquire_read) && defined(AO_HAVE_short_load_acquire) # define AO_short_load_acquire_read(addr) AO_short_load_acquire(addr) # define AO_HAVE_short_load_acquire_read #endif #if defined(AO_HAVE_short_load) && defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_short_load_acquire) AO_INLINE unsigned short AO_short_load_acquire(const volatile unsigned short *addr) { unsigned short result = AO_short_load(addr); /* Acquire barrier would be useless, since the load could be delayed */ /* beyond it. */ AO_nop_full(); return result; } # define AO_HAVE_short_load_acquire #endif #if defined(AO_HAVE_short_load) && defined(AO_HAVE_nop_read) && \ !defined(AO_HAVE_short_load_read) AO_INLINE unsigned short AO_short_load_read(const volatile unsigned short *addr) { unsigned short result = AO_short_load(addr); /* Acquire barrier would be useless, since the load could be delayed */ /* beyond it. 
*/ AO_nop_read(); return result; } # define AO_HAVE_short_load_read #endif #if defined(AO_HAVE_short_load_acquire) && defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_short_load_full) # define AO_short_load_full(addr) (AO_nop_full(), AO_short_load_acquire(addr)) # define AO_HAVE_short_load_full #endif #if !defined(AO_HAVE_short_load_acquire_read) && defined(AO_HAVE_short_load_read) # define AO_short_load_acquire_read(addr) AO_short_load_read(addr) # define AO_HAVE_short_load_acquire_read #endif #if defined(AO_HAVE_short_load_acquire_read) && !defined(AO_HAVE_short_load) # define AO_short_load(addr) AO_short_load_acquire_read(addr) # define AO_HAVE_short_load #endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_short_load_acquire_read) # define AO_short_load_dd_acquire_read(addr) \ AO_short_load_acquire_read(addr) # define AO_HAVE_short_load_dd_acquire_read # endif #else # if defined(AO_HAVE_short_load) # define AO_short_load_dd_acquire_read(addr) \ AO_short_load(addr) # define AO_HAVE_short_load_dd_acquire_read # endif #endif /* short_store */ #if defined(AO_HAVE_short_store_release) && !defined(AO_HAVE_short_store) # define AO_short_store(addr, val) AO_short_store_release(addr,val) # define AO_HAVE_short_store #endif #if defined(AO_HAVE_short_store_full) && !defined(AO_HAVE_short_store_release) # define AO_short_store_release(addr,val) AO_short_store_full(addr,val) # define AO_HAVE_short_store_release #endif #if defined(AO_HAVE_short_store_full) && !defined(AO_HAVE_short_store_write) # define AO_short_store_write(addr,val) AO_short_store_full(addr,val) # define AO_HAVE_short_store_write #endif #if defined(AO_HAVE_short_store_release) && \ !defined(AO_HAVE_short_store_release_write) # define AO_short_store_release_write(addr, val) \ AO_short_store_release(addr,val) # define AO_HAVE_short_store_release_write #endif #if defined(AO_HAVE_short_store_write) && !defined(AO_HAVE_short_store) # define AO_short_store(addr, val) AO_short_store_write(addr,val) # define AO_HAVE_short_store #endif #if defined(AO_HAVE_short_store) && defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_short_store_release) # define AO_short_store_release(addr,val) \ (AO_nop_full(), AO_short_store(addr,val)) # define AO_HAVE_short_store_release #endif #if defined(AO_HAVE_nop_write) && defined(AO_HAVE_short_store) && \ !defined(AO_HAVE_short_store_write) # define AO_short_store_write(addr, val) \ (AO_nop_write(), AO_short_store(addr,val)) # define AO_HAVE_short_store_write #endif #if defined(AO_HAVE_short_store_write) && \ !defined(AO_HAVE_short_store_release_write) # define AO_short_store_release_write(addr, val) AO_short_store_write(addr,val) # define AO_HAVE_short_store_release_write #endif #if defined(AO_HAVE_short_store_release) && defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_short_store_full) # define AO_short_store_full(addr, val) \ (AO_short_store_release(addr, val), AO_nop_full()) # define AO_HAVE_short_store_full #endif /* short_fetch_and_add */ #if defined(AO_HAVE_short_compare_and_swap_full) && \ !defined(AO_HAVE_short_fetch_and_add_full) AO_INLINE AO_t AO_short_fetch_and_add_full(volatile unsigned short *addr, unsigned short incr) { unsigned short old; do { old = *addr; } while (!AO_short_compare_and_swap_full(addr, old, old+incr)); return old; } # define AO_HAVE_short_fetch_and_add_full #endif #if defined(AO_HAVE_short_compare_and_swap_acquire) && \ !defined(AO_HAVE_short_fetch_and_add_acquire) AO_INLINE AO_t AO_short_fetch_and_add_acquire(volatile unsigned short *addr, unsigned short incr) { unsigned short 
old; do { old = *addr; } while (!AO_short_compare_and_swap_acquire(addr, old, old+incr)); return old; } # define AO_HAVE_short_fetch_and_add_acquire #endif #if defined(AO_HAVE_short_compare_and_swap_release) && \ !defined(AO_HAVE_short_fetch_and_add_release) AO_INLINE AO_t AO_short_fetch_and_add_release(volatile unsigned short *addr, unsigned short incr) { unsigned short old; do { old = *addr; } while (!AO_short_compare_and_swap_release(addr, old, old+incr)); return old; } # define AO_HAVE_short_fetch_and_add_release #endif #if defined(AO_HAVE_short_fetch_and_add_full) # if !defined(AO_HAVE_short_fetch_and_add_release) # define AO_short_fetch_and_add_release(addr, val) \ AO_short_fetch_and_add_full(addr, val) # define AO_HAVE_short_fetch_and_add_release # endif # if !defined(AO_HAVE_short_fetch_and_add_acquire) # define AO_short_fetch_and_add_acquire(addr, val) \ AO_short_fetch_and_add_full(addr, val) # define AO_HAVE_short_fetch_and_add_acquire # endif # if !defined(AO_HAVE_short_fetch_and_add_write) # define AO_short_fetch_and_add_write(addr, val) \ AO_short_fetch_and_add_full(addr, val) # define AO_HAVE_short_fetch_and_add_write # endif # if !defined(AO_HAVE_short_fetch_and_add_read) # define AO_short_fetch_and_add_read(addr, val) \ AO_short_fetch_and_add_full(addr, val) # define AO_HAVE_short_fetch_and_add_read # endif #endif /* AO_HAVE_short_fetch_and_add_full */ #if !defined(AO_HAVE_short_fetch_and_add) && \ defined(AO_HAVE_short_fetch_and_add_release) # define AO_short_fetch_and_add(addr, val) \ AO_short_fetch_and_add_release(addr, val) # define AO_HAVE_short_fetch_and_add #endif #if !defined(AO_HAVE_short_fetch_and_add) && \ defined(AO_HAVE_short_fetch_and_add_acquire) # define AO_short_fetch_and_add(addr, val) \ AO_short_fetch_and_add_acquire(addr, val) # define AO_HAVE_short_fetch_and_add #endif #if !defined(AO_HAVE_short_fetch_and_add) && \ defined(AO_HAVE_short_fetch_and_add_write) # define AO_short_fetch_and_add(addr, val) \ AO_short_fetch_and_add_write(addr, val) # define AO_HAVE_short_fetch_and_add #endif #if !defined(AO_HAVE_short_fetch_and_add) && \ defined(AO_HAVE_short_fetch_and_add_read) # define AO_short_fetch_and_add(addr, val) \ AO_short_fetch_and_add_read(addr, val) # define AO_HAVE_short_fetch_and_add #endif #if defined(AO_HAVE_short_fetch_and_add_acquire) &&\ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_short_fetch_and_add_full) # define AO_short_fetch_and_add_full(addr, val) \ (AO_nop_full(), AO_short_fetch_and_add_acquire(addr, val)) #endif #if !defined(AO_HAVE_short_fetch_and_add_release_write) && \ defined(AO_HAVE_short_fetch_and_add_write) # define AO_short_fetch_and_add_release_write(addr, val) \ AO_short_fetch_and_add_write(addr, val) # define AO_HAVE_short_fetch_and_add_release_write #endif #if !defined(AO_HAVE_short_fetch_and_add_release_write) && \ defined(AO_HAVE_short_fetch_and_add_release) # define AO_short_fetch_and_add_release_write(addr, val) \ AO_short_fetch_and_add_release(addr, val) # define AO_HAVE_short_fetch_and_add_release_write #endif #if !defined(AO_HAVE_short_fetch_and_add_acquire_read) && \ defined(AO_HAVE_short_fetch_and_add_read) # define AO_short_fetch_and_add_acquire_read(addr, val) \ AO_short_fetch_and_add_read(addr, val) # define AO_HAVE_short_fetch_and_add_acquire_read #endif #if !defined(AO_HAVE_short_fetch_and_add_acquire_read) && \ defined(AO_HAVE_short_fetch_and_add_acquire) # define AO_short_fetch_and_add_acquire_read(addr, val) \ AO_short_fetch_and_add_acquire(addr, val) # define AO_HAVE_short_fetch_and_add_acquire_read 
#endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_short_fetch_and_add_acquire_read) # define AO_short_fetch_and_add_dd_acquire_read(addr, val) \ AO_short_fetch_and_add_acquire_read(addr, val) # define AO_HAVE_short_fetch_and_add_dd_acquire_read # endif #else # if defined(AO_HAVE_short_fetch_and_add) # define AO_short_fetch_and_add_dd_acquire_read(addr, val) \ AO_short_fetch_and_add(addr, val) # define AO_HAVE_short_fetch_and_add_dd_acquire_read # endif #endif /* short_fetch_and_add1 */ #if defined(AO_HAVE_short_fetch_and_add_full) &&\ !defined(AO_HAVE_short_fetch_and_add1_full) # define AO_short_fetch_and_add1_full(addr) \ AO_short_fetch_and_add_full(addr,1) # define AO_HAVE_short_fetch_and_add1_full #endif #if defined(AO_HAVE_short_fetch_and_add_release) &&\ !defined(AO_HAVE_short_fetch_and_add1_release) # define AO_short_fetch_and_add1_release(addr) \ AO_short_fetch_and_add_release(addr,1) # define AO_HAVE_short_fetch_and_add1_release #endif #if defined(AO_HAVE_short_fetch_and_add_acquire) &&\ !defined(AO_HAVE_short_fetch_and_add1_acquire) # define AO_short_fetch_and_add1_acquire(addr) \ AO_short_fetch_and_add_acquire(addr,1) # define AO_HAVE_short_fetch_and_add1_acquire #endif #if defined(AO_HAVE_short_fetch_and_add_write) &&\ !defined(AO_HAVE_short_fetch_and_add1_write) # define AO_short_fetch_and_add1_write(addr) \ AO_short_fetch_and_add_write(addr,1) # define AO_HAVE_short_fetch_and_add1_write #endif #if defined(AO_HAVE_short_fetch_and_add_read) &&\ !defined(AO_HAVE_short_fetch_and_add1_read) # define AO_short_fetch_and_add1_read(addr) \ AO_short_fetch_and_add_read(addr,1) # define AO_HAVE_short_fetch_and_add1_read #endif #if defined(AO_HAVE_short_fetch_and_add_release_write) &&\ !defined(AO_HAVE_short_fetch_and_add1_release_write) # define AO_short_fetch_and_add1_release_write(addr) \ AO_short_fetch_and_add_release_write(addr,1) # define AO_HAVE_short_fetch_and_add1_release_write #endif #if defined(AO_HAVE_short_fetch_and_add_acquire_read) &&\ !defined(AO_HAVE_short_fetch_and_add1_acquire_read) # define AO_short_fetch_and_add1_acquire_read(addr) \ AO_short_fetch_and_add_acquire_read(addr,1) # define AO_HAVE_short_fetch_and_add1_acquire_read #endif #if defined(AO_HAVE_short_fetch_and_add) &&\ !defined(AO_HAVE_short_fetch_and_add1) # define AO_short_fetch_and_add1(addr) \ AO_short_fetch_and_add(addr,1) # define AO_HAVE_short_fetch_and_add1 #endif #if defined(AO_HAVE_short_fetch_and_add1_full) # if !defined(AO_HAVE_short_fetch_and_add1_release) # define AO_short_fetch_and_add1_release(addr) \ AO_short_fetch_and_add1_full(addr) # define AO_HAVE_short_fetch_and_add1_release # endif # if !defined(AO_HAVE_short_fetch_and_add1_acquire) # define AO_short_fetch_and_add1_acquire(addr) \ AO_short_fetch_and_add1_full(addr) # define AO_HAVE_short_fetch_and_add1_acquire # endif # if !defined(AO_HAVE_short_fetch_and_add1_write) # define AO_short_fetch_and_add1_write(addr) \ AO_short_fetch_and_add1_full(addr) # define AO_HAVE_short_fetch_and_add1_write # endif # if !defined(AO_HAVE_short_fetch_and_add1_read) # define AO_short_fetch_and_add1_read(addr) \ AO_short_fetch_and_add1_full(addr) # define AO_HAVE_short_fetch_and_add1_read # endif #endif /* AO_HAVE_short_fetch_and_add1_full */ #if !defined(AO_HAVE_short_fetch_and_add1) && \ defined(AO_HAVE_short_fetch_and_add1_release) # define AO_short_fetch_and_add1(addr) \ AO_short_fetch_and_add1_release(addr) # define AO_HAVE_short_fetch_and_add1 #endif #if !defined(AO_HAVE_short_fetch_and_add1) && \ defined(AO_HAVE_short_fetch_and_add1_acquire) # 
define AO_short_fetch_and_add1(addr) \ AO_short_fetch_and_add1_acquire(addr) # define AO_HAVE_short_fetch_and_add1 #endif #if !defined(AO_HAVE_short_fetch_and_add1) && \ defined(AO_HAVE_short_fetch_and_add1_write) # define AO_short_fetch_and_add1(addr) \ AO_short_fetch_and_add1_write(addr) # define AO_HAVE_short_fetch_and_add1 #endif #if !defined(AO_HAVE_short_fetch_and_add1) && \ defined(AO_HAVE_short_fetch_and_add1_read) # define AO_short_fetch_and_add1(addr) \ AO_short_fetch_and_add1_read(addr) # define AO_HAVE_short_fetch_and_add1 #endif #if defined(AO_HAVE_short_fetch_and_add1_acquire) &&\ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_short_fetch_and_add1_full) # define AO_short_fetch_and_add1_full(addr) \ (AO_nop_full(), AO_short_fetch_and_add1_acquire(addr)) # define AO_HAVE_short_fetch_and_add1_full #endif #if !defined(AO_HAVE_short_fetch_and_add1_release_write) && \ defined(AO_HAVE_short_fetch_and_add1_write) # define AO_short_fetch_and_add1_release_write(addr) \ AO_short_fetch_and_add1_write(addr) # define AO_HAVE_short_fetch_and_add1_release_write #endif #if !defined(AO_HAVE_short_fetch_and_add1_release_write) && \ defined(AO_HAVE_short_fetch_and_add1_release) # define AO_short_fetch_and_add1_release_write(addr) \ AO_short_fetch_and_add1_release(addr) # define AO_HAVE_short_fetch_and_add1_release_write #endif #if !defined(AO_HAVE_short_fetch_and_add1_acquire_read) && \ defined(AO_HAVE_short_fetch_and_add1_read) # define AO_short_fetch_and_add1_acquire_read(addr) \ AO_short_fetch_and_add1_read(addr) # define AO_HAVE_short_fetch_and_add1_acquire_read #endif #if !defined(AO_HAVE_short_fetch_and_add1_acquire_read) && \ defined(AO_HAVE_short_fetch_and_add1_acquire) # define AO_short_fetch_and_add1_acquire_read(addr) \ AO_short_fetch_and_add1_acquire(addr) # define AO_HAVE_short_fetch_and_add1_acquire_read #endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_short_fetch_and_add1_acquire_read) # define AO_short_fetch_and_add1_dd_acquire_read(addr) \ AO_short_fetch_and_add1_acquire_read(addr) # define AO_HAVE_short_fetch_and_add1_dd_acquire_read # endif #else # if defined(AO_HAVE_short_fetch_and_add1) # define AO_short_fetch_and_add1_dd_acquire_read(addr) \ AO_short_fetch_and_add1(addr) # define AO_HAVE_short_fetch_and_add1_dd_acquire_read # endif #endif /* short_fetch_and_sub1 */ #if defined(AO_HAVE_short_fetch_and_add_full) &&\ !defined(AO_HAVE_short_fetch_and_sub1_full) # define AO_short_fetch_and_sub1_full(addr) \ AO_short_fetch_and_add_full(addr,(unsigned short)(-1)) # define AO_HAVE_short_fetch_and_sub1_full #endif #if defined(AO_HAVE_short_fetch_and_add_release) &&\ !defined(AO_HAVE_short_fetch_and_sub1_release) # define AO_short_fetch_and_sub1_release(addr) \ AO_short_fetch_and_add_release(addr,(unsigned short)(-1)) # define AO_HAVE_short_fetch_and_sub1_release #endif #if defined(AO_HAVE_short_fetch_and_add_acquire) &&\ !defined(AO_HAVE_short_fetch_and_sub1_acquire) # define AO_short_fetch_and_sub1_acquire(addr) \ AO_short_fetch_and_add_acquire(addr,(unsigned short)(-1)) # define AO_HAVE_short_fetch_and_sub1_acquire #endif #if defined(AO_HAVE_short_fetch_and_add_write) &&\ !defined(AO_HAVE_short_fetch_and_sub1_write) # define AO_short_fetch_and_sub1_write(addr) \ AO_short_fetch_and_add_write(addr,(unsigned short)(-1)) # define AO_HAVE_short_fetch_and_sub1_write #endif #if defined(AO_HAVE_short_fetch_and_add_read) &&\ !defined(AO_HAVE_short_fetch_and_sub1_read) # define AO_short_fetch_and_sub1_read(addr) \ AO_short_fetch_and_add_read(addr,(unsigned short)(-1)) # define 
AO_HAVE_short_fetch_and_sub1_read #endif #if defined(AO_HAVE_short_fetch_and_add_release_write) &&\ !defined(AO_HAVE_short_fetch_and_sub1_release_write) # define AO_short_fetch_and_sub1_release_write(addr) \ AO_short_fetch_and_add_release_write(addr,(unsigned short)(-1)) # define AO_HAVE_short_fetch_and_sub1_release_write #endif #if defined(AO_HAVE_short_fetch_and_add_acquire_read) &&\ !defined(AO_HAVE_short_fetch_and_sub1_acquire_read) # define AO_short_fetch_and_sub1_acquire_read(addr) \ AO_short_fetch_and_add_acquire_read(addr,(unsigned short)(-1)) # define AO_HAVE_short_fetch_and_sub1_acquire_read #endif #if defined(AO_HAVE_short_fetch_and_add) &&\ !defined(AO_HAVE_short_fetch_and_sub1) # define AO_short_fetch_and_sub1(addr) \ AO_short_fetch_and_add(addr,(unsigned short)(-1)) # define AO_HAVE_short_fetch_and_sub1 #endif #if defined(AO_HAVE_short_fetch_and_sub1_full) # if !defined(AO_HAVE_short_fetch_and_sub1_release) # define AO_short_fetch_and_sub1_release(addr) \ AO_short_fetch_and_sub1_full(addr) # define AO_HAVE_short_fetch_and_sub1_release # endif # if !defined(AO_HAVE_short_fetch_and_sub1_acquire) # define AO_short_fetch_and_sub1_acquire(addr) \ AO_short_fetch_and_sub1_full(addr) # define AO_HAVE_short_fetch_and_sub1_acquire # endif # if !defined(AO_HAVE_short_fetch_and_sub1_write) # define AO_short_fetch_and_sub1_write(addr) \ AO_short_fetch_and_sub1_full(addr) # define AO_HAVE_short_fetch_and_sub1_write # endif # if !defined(AO_HAVE_short_fetch_and_sub1_read) # define AO_short_fetch_and_sub1_read(addr) \ AO_short_fetch_and_sub1_full(addr) # define AO_HAVE_short_fetch_and_sub1_read # endif #endif /* AO_HAVE_short_fetch_and_sub1_full */ #if !defined(AO_HAVE_short_fetch_and_sub1) && \ defined(AO_HAVE_short_fetch_and_sub1_release) # define AO_short_fetch_and_sub1(addr) \ AO_short_fetch_and_sub1_release(addr) # define AO_HAVE_short_fetch_and_sub1 #endif #if !defined(AO_HAVE_short_fetch_and_sub1) && \ defined(AO_HAVE_short_fetch_and_sub1_acquire) # define AO_short_fetch_and_sub1(addr) \ AO_short_fetch_and_sub1_acquire(addr) # define AO_HAVE_short_fetch_and_sub1 #endif #if !defined(AO_HAVE_short_fetch_and_sub1) && \ defined(AO_HAVE_short_fetch_and_sub1_write) # define AO_short_fetch_and_sub1(addr) \ AO_short_fetch_and_sub1_write(addr) # define AO_HAVE_short_fetch_and_sub1 #endif #if !defined(AO_HAVE_short_fetch_and_sub1) && \ defined(AO_HAVE_short_fetch_and_sub1_read) # define AO_short_fetch_and_sub1(addr) \ AO_short_fetch_and_sub1_read(addr) # define AO_HAVE_short_fetch_and_sub1 #endif #if defined(AO_HAVE_short_fetch_and_sub1_acquire) &&\ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_short_fetch_and_sub1_full) # define AO_short_fetch_and_sub1_full(addr) \ (AO_nop_full(), AO_short_fetch_and_sub1_acquire(addr)) # define AO_HAVE_short_fetch_and_sub1_full #endif #if !defined(AO_HAVE_short_fetch_and_sub1_release_write) && \ defined(AO_HAVE_short_fetch_and_sub1_write) # define AO_short_fetch_and_sub1_release_write(addr) \ AO_short_fetch_and_sub1_write(addr) # define AO_HAVE_short_fetch_and_sub1_release_write #endif #if !defined(AO_HAVE_short_fetch_and_sub1_release_write) && \ defined(AO_HAVE_short_fetch_and_sub1_release) # define AO_short_fetch_and_sub1_release_write(addr) \ AO_short_fetch_and_sub1_release(addr) # define AO_HAVE_short_fetch_and_sub1_release_write #endif #if !defined(AO_HAVE_short_fetch_and_sub1_acquire_read) && \ defined(AO_HAVE_short_fetch_and_sub1_read) # define AO_short_fetch_and_sub1_acquire_read(addr) \ AO_short_fetch_and_sub1_read(addr) # define 
AO_HAVE_short_fetch_and_sub1_acquire_read #endif #if !defined(AO_HAVE_short_fetch_and_sub1_acquire_read) && \ defined(AO_HAVE_short_fetch_and_sub1_acquire) # define AO_short_fetch_and_sub1_acquire_read(addr) \ AO_short_fetch_and_sub1_acquire(addr) # define AO_HAVE_short_fetch_and_sub1_acquire_read #endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_short_fetch_and_sub1_acquire_read) # define AO_short_fetch_and_sub1_dd_acquire_read(addr) \ AO_short_fetch_and_sub1_acquire_read(addr) # define AO_HAVE_short_fetch_and_sub1_dd_acquire_read # endif #else # if defined(AO_HAVE_short_fetch_and_sub1) # define AO_short_fetch_and_sub1_dd_acquire_read(addr) \ AO_short_fetch_and_sub1(addr) # define AO_HAVE_short_fetch_and_sub1_dd_acquire_read # endif #endif /* int_load */ #if defined(AO_HAVE_int_load_acquire) && !defined(AO_HAVE_int_load) # define AO_int_load(addr) AO_int_load_acquire(addr) # define AO_HAVE_int_load #endif #if defined(AO_HAVE_int_load_full) && !defined(AO_HAVE_int_load_acquire) # define AO_int_load_acquire(addr) AO_int_load_full(addr) # define AO_HAVE_int_load_acquire #endif #if defined(AO_HAVE_int_load_full) && !defined(AO_HAVE_int_load_read) # define AO_int_load_read(addr) AO_int_load_full(addr) # define AO_HAVE_int_load_read #endif #if !defined(AO_HAVE_int_load_acquire_read) && defined(AO_HAVE_int_load_acquire) # define AO_int_load_acquire_read(addr) AO_int_load_acquire(addr) # define AO_HAVE_int_load_acquire_read #endif #if defined(AO_HAVE_int_load) && defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_int_load_acquire) AO_INLINE unsigned int AO_int_load_acquire(const volatile unsigned int *addr) { unsigned int result = AO_int_load(addr); /* Acquire barrier would be useless, since the load could be delayed */ /* beyond it. */ AO_nop_full(); return result; } # define AO_HAVE_int_load_acquire #endif #if defined(AO_HAVE_int_load) && defined(AO_HAVE_nop_read) && \ !defined(AO_HAVE_int_load_read) AO_INLINE unsigned int AO_int_load_read(const volatile unsigned int *addr) { unsigned int result = AO_int_load(addr); /* Acquire barrier would be useless, since the load could be delayed */ /* beyond it. 
*/ AO_nop_read(); return result; } # define AO_HAVE_int_load_read #endif #if defined(AO_HAVE_int_load_acquire) && defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_int_load_full) # define AO_int_load_full(addr) (AO_nop_full(), AO_int_load_acquire(addr)) # define AO_HAVE_int_load_full #endif #if !defined(AO_HAVE_int_load_acquire_read) && defined(AO_HAVE_int_load_read) # define AO_int_load_acquire_read(addr) AO_int_load_read(addr) # define AO_HAVE_int_load_acquire_read #endif #if defined(AO_HAVE_int_load_acquire_read) && !defined(AO_HAVE_int_load) # define AO_int_load(addr) AO_int_load_acquire_read(addr) # define AO_HAVE_int_load #endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_int_load_acquire_read) # define AO_int_load_dd_acquire_read(addr) \ AO_int_load_acquire_read(addr) # define AO_HAVE_int_load_dd_acquire_read # endif #else # if defined(AO_HAVE_int_load) # define AO_int_load_dd_acquire_read(addr) \ AO_int_load(addr) # define AO_HAVE_int_load_dd_acquire_read # endif #endif /* int_store */ #if defined(AO_HAVE_int_store_release) && !defined(AO_HAVE_int_store) # define AO_int_store(addr, val) AO_int_store_release(addr,val) # define AO_HAVE_int_store #endif #if defined(AO_HAVE_int_store_full) && !defined(AO_HAVE_int_store_release) # define AO_int_store_release(addr,val) AO_int_store_full(addr,val) # define AO_HAVE_int_store_release #endif #if defined(AO_HAVE_int_store_full) && !defined(AO_HAVE_int_store_write) # define AO_int_store_write(addr,val) AO_int_store_full(addr,val) # define AO_HAVE_int_store_write #endif #if defined(AO_HAVE_int_store_release) && \ !defined(AO_HAVE_int_store_release_write) # define AO_int_store_release_write(addr, val) \ AO_int_store_release(addr,val) # define AO_HAVE_int_store_release_write #endif #if defined(AO_HAVE_int_store_write) && !defined(AO_HAVE_int_store) # define AO_int_store(addr, val) AO_int_store_write(addr,val) # define AO_HAVE_int_store #endif #if defined(AO_HAVE_int_store) && defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_int_store_release) # define AO_int_store_release(addr,val) \ (AO_nop_full(), AO_int_store(addr,val)) # define AO_HAVE_int_store_release #endif #if defined(AO_HAVE_nop_write) && defined(AO_HAVE_int_store) && \ !defined(AO_HAVE_int_store_write) # define AO_int_store_write(addr, val) \ (AO_nop_write(), AO_int_store(addr,val)) # define AO_HAVE_int_store_write #endif #if defined(AO_HAVE_int_store_write) && \ !defined(AO_HAVE_int_store_release_write) # define AO_int_store_release_write(addr, val) AO_int_store_write(addr,val) # define AO_HAVE_int_store_release_write #endif #if defined(AO_HAVE_int_store_release) && defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_int_store_full) # define AO_int_store_full(addr, val) \ (AO_int_store_release(addr, val), AO_nop_full()) # define AO_HAVE_int_store_full #endif /* int_fetch_and_add */ #if defined(AO_HAVE_int_compare_and_swap_full) && \ !defined(AO_HAVE_int_fetch_and_add_full) AO_INLINE AO_t AO_int_fetch_and_add_full(volatile unsigned int *addr, unsigned int incr) { unsigned int old; do { old = *addr; } while (!AO_int_compare_and_swap_full(addr, old, old+incr)); return old; } # define AO_HAVE_int_fetch_and_add_full #endif #if defined(AO_HAVE_int_compare_and_swap_acquire) && \ !defined(AO_HAVE_int_fetch_and_add_acquire) AO_INLINE AO_t AO_int_fetch_and_add_acquire(volatile unsigned int *addr, unsigned int incr) { unsigned int old; do { old = *addr; } while (!AO_int_compare_and_swap_acquire(addr, old, old+incr)); return old; } # define AO_HAVE_int_fetch_and_add_acquire #endif #if 
defined(AO_HAVE_int_compare_and_swap_release) && \ !defined(AO_HAVE_int_fetch_and_add_release) AO_INLINE AO_t AO_int_fetch_and_add_release(volatile unsigned int *addr, unsigned int incr) { unsigned int old; do { old = *addr; } while (!AO_int_compare_and_swap_release(addr, old, old+incr)); return old; } # define AO_HAVE_int_fetch_and_add_release #endif #if defined(AO_HAVE_int_fetch_and_add_full) # if !defined(AO_HAVE_int_fetch_and_add_release) # define AO_int_fetch_and_add_release(addr, val) \ AO_int_fetch_and_add_full(addr, val) # define AO_HAVE_int_fetch_and_add_release # endif # if !defined(AO_HAVE_int_fetch_and_add_acquire) # define AO_int_fetch_and_add_acquire(addr, val) \ AO_int_fetch_and_add_full(addr, val) # define AO_HAVE_int_fetch_and_add_acquire # endif # if !defined(AO_HAVE_int_fetch_and_add_write) # define AO_int_fetch_and_add_write(addr, val) \ AO_int_fetch_and_add_full(addr, val) # define AO_HAVE_int_fetch_and_add_write # endif # if !defined(AO_HAVE_int_fetch_and_add_read) # define AO_int_fetch_and_add_read(addr, val) \ AO_int_fetch_and_add_full(addr, val) # define AO_HAVE_int_fetch_and_add_read # endif #endif /* AO_HAVE_int_fetch_and_add_full */ #if !defined(AO_HAVE_int_fetch_and_add) && \ defined(AO_HAVE_int_fetch_and_add_release) # define AO_int_fetch_and_add(addr, val) \ AO_int_fetch_and_add_release(addr, val) # define AO_HAVE_int_fetch_and_add #endif #if !defined(AO_HAVE_int_fetch_and_add) && \ defined(AO_HAVE_int_fetch_and_add_acquire) # define AO_int_fetch_and_add(addr, val) \ AO_int_fetch_and_add_acquire(addr, val) # define AO_HAVE_int_fetch_and_add #endif #if !defined(AO_HAVE_int_fetch_and_add) && \ defined(AO_HAVE_int_fetch_and_add_write) # define AO_int_fetch_and_add(addr, val) \ AO_int_fetch_and_add_write(addr, val) # define AO_HAVE_int_fetch_and_add #endif #if !defined(AO_HAVE_int_fetch_and_add) && \ defined(AO_HAVE_int_fetch_and_add_read) # define AO_int_fetch_and_add(addr, val) \ AO_int_fetch_and_add_read(addr, val) # define AO_HAVE_int_fetch_and_add #endif #if defined(AO_HAVE_int_fetch_and_add_acquire) &&\ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_int_fetch_and_add_full) # define AO_int_fetch_and_add_full(addr, val) \ (AO_nop_full(), AO_int_fetch_and_add_acquire(addr, val)) #endif #if !defined(AO_HAVE_int_fetch_and_add_release_write) && \ defined(AO_HAVE_int_fetch_and_add_write) # define AO_int_fetch_and_add_release_write(addr, val) \ AO_int_fetch_and_add_write(addr, val) # define AO_HAVE_int_fetch_and_add_release_write #endif #if !defined(AO_HAVE_int_fetch_and_add_release_write) && \ defined(AO_HAVE_int_fetch_and_add_release) # define AO_int_fetch_and_add_release_write(addr, val) \ AO_int_fetch_and_add_release(addr, val) # define AO_HAVE_int_fetch_and_add_release_write #endif #if !defined(AO_HAVE_int_fetch_and_add_acquire_read) && \ defined(AO_HAVE_int_fetch_and_add_read) # define AO_int_fetch_and_add_acquire_read(addr, val) \ AO_int_fetch_and_add_read(addr, val) # define AO_HAVE_int_fetch_and_add_acquire_read #endif #if !defined(AO_HAVE_int_fetch_and_add_acquire_read) && \ defined(AO_HAVE_int_fetch_and_add_acquire) # define AO_int_fetch_and_add_acquire_read(addr, val) \ AO_int_fetch_and_add_acquire(addr, val) # define AO_HAVE_int_fetch_and_add_acquire_read #endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_int_fetch_and_add_acquire_read) # define AO_int_fetch_and_add_dd_acquire_read(addr, val) \ AO_int_fetch_and_add_acquire_read(addr, val) # define AO_HAVE_int_fetch_and_add_dd_acquire_read # endif #else # if defined(AO_HAVE_int_fetch_and_add) # 
define AO_int_fetch_and_add_dd_acquire_read(addr, val) \ AO_int_fetch_and_add(addr, val) # define AO_HAVE_int_fetch_and_add_dd_acquire_read # endif #endif /* int_fetch_and_add1 */ #if defined(AO_HAVE_int_fetch_and_add_full) &&\ !defined(AO_HAVE_int_fetch_and_add1_full) # define AO_int_fetch_and_add1_full(addr) \ AO_int_fetch_and_add_full(addr,1) # define AO_HAVE_int_fetch_and_add1_full #endif #if defined(AO_HAVE_int_fetch_and_add_release) &&\ !defined(AO_HAVE_int_fetch_and_add1_release) # define AO_int_fetch_and_add1_release(addr) \ AO_int_fetch_and_add_release(addr,1) # define AO_HAVE_int_fetch_and_add1_release #endif #if defined(AO_HAVE_int_fetch_and_add_acquire) &&\ !defined(AO_HAVE_int_fetch_and_add1_acquire) # define AO_int_fetch_and_add1_acquire(addr) \ AO_int_fetch_and_add_acquire(addr,1) # define AO_HAVE_int_fetch_and_add1_acquire #endif #if defined(AO_HAVE_int_fetch_and_add_write) &&\ !defined(AO_HAVE_int_fetch_and_add1_write) # define AO_int_fetch_and_add1_write(addr) \ AO_int_fetch_and_add_write(addr,1) # define AO_HAVE_int_fetch_and_add1_write #endif #if defined(AO_HAVE_int_fetch_and_add_read) &&\ !defined(AO_HAVE_int_fetch_and_add1_read) # define AO_int_fetch_and_add1_read(addr) \ AO_int_fetch_and_add_read(addr,1) # define AO_HAVE_int_fetch_and_add1_read #endif #if defined(AO_HAVE_int_fetch_and_add_release_write) &&\ !defined(AO_HAVE_int_fetch_and_add1_release_write) # define AO_int_fetch_and_add1_release_write(addr) \ AO_int_fetch_and_add_release_write(addr,1) # define AO_HAVE_int_fetch_and_add1_release_write #endif #if defined(AO_HAVE_int_fetch_and_add_acquire_read) &&\ !defined(AO_HAVE_int_fetch_and_add1_acquire_read) # define AO_int_fetch_and_add1_acquire_read(addr) \ AO_int_fetch_and_add_acquire_read(addr,1) # define AO_HAVE_int_fetch_and_add1_acquire_read #endif #if defined(AO_HAVE_int_fetch_and_add) &&\ !defined(AO_HAVE_int_fetch_and_add1) # define AO_int_fetch_and_add1(addr) \ AO_int_fetch_and_add(addr,1) # define AO_HAVE_int_fetch_and_add1 #endif #if defined(AO_HAVE_int_fetch_and_add1_full) # if !defined(AO_HAVE_int_fetch_and_add1_release) # define AO_int_fetch_and_add1_release(addr) \ AO_int_fetch_and_add1_full(addr) # define AO_HAVE_int_fetch_and_add1_release # endif # if !defined(AO_HAVE_int_fetch_and_add1_acquire) # define AO_int_fetch_and_add1_acquire(addr) \ AO_int_fetch_and_add1_full(addr) # define AO_HAVE_int_fetch_and_add1_acquire # endif # if !defined(AO_HAVE_int_fetch_and_add1_write) # define AO_int_fetch_and_add1_write(addr) \ AO_int_fetch_and_add1_full(addr) # define AO_HAVE_int_fetch_and_add1_write # endif # if !defined(AO_HAVE_int_fetch_and_add1_read) # define AO_int_fetch_and_add1_read(addr) \ AO_int_fetch_and_add1_full(addr) # define AO_HAVE_int_fetch_and_add1_read # endif #endif /* AO_HAVE_int_fetch_and_add1_full */ #if !defined(AO_HAVE_int_fetch_and_add1) && \ defined(AO_HAVE_int_fetch_and_add1_release) # define AO_int_fetch_and_add1(addr) \ AO_int_fetch_and_add1_release(addr) # define AO_HAVE_int_fetch_and_add1 #endif #if !defined(AO_HAVE_int_fetch_and_add1) && \ defined(AO_HAVE_int_fetch_and_add1_acquire) # define AO_int_fetch_and_add1(addr) \ AO_int_fetch_and_add1_acquire(addr) # define AO_HAVE_int_fetch_and_add1 #endif #if !defined(AO_HAVE_int_fetch_and_add1) && \ defined(AO_HAVE_int_fetch_and_add1_write) # define AO_int_fetch_and_add1(addr) \ AO_int_fetch_and_add1_write(addr) # define AO_HAVE_int_fetch_and_add1 #endif #if !defined(AO_HAVE_int_fetch_and_add1) && \ defined(AO_HAVE_int_fetch_and_add1_read) # define AO_int_fetch_and_add1(addr) \ 
AO_int_fetch_and_add1_read(addr) # define AO_HAVE_int_fetch_and_add1 #endif #if defined(AO_HAVE_int_fetch_and_add1_acquire) &&\ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_int_fetch_and_add1_full) # define AO_int_fetch_and_add1_full(addr) \ (AO_nop_full(), AO_int_fetch_and_add1_acquire(addr)) # define AO_HAVE_int_fetch_and_add1_full #endif #if !defined(AO_HAVE_int_fetch_and_add1_release_write) && \ defined(AO_HAVE_int_fetch_and_add1_write) # define AO_int_fetch_and_add1_release_write(addr) \ AO_int_fetch_and_add1_write(addr) # define AO_HAVE_int_fetch_and_add1_release_write #endif #if !defined(AO_HAVE_int_fetch_and_add1_release_write) && \ defined(AO_HAVE_int_fetch_and_add1_release) # define AO_int_fetch_and_add1_release_write(addr) \ AO_int_fetch_and_add1_release(addr) # define AO_HAVE_int_fetch_and_add1_release_write #endif #if !defined(AO_HAVE_int_fetch_and_add1_acquire_read) && \ defined(AO_HAVE_int_fetch_and_add1_read) # define AO_int_fetch_and_add1_acquire_read(addr) \ AO_int_fetch_and_add1_read(addr) # define AO_HAVE_int_fetch_and_add1_acquire_read #endif #if !defined(AO_HAVE_int_fetch_and_add1_acquire_read) && \ defined(AO_HAVE_int_fetch_and_add1_acquire) # define AO_int_fetch_and_add1_acquire_read(addr) \ AO_int_fetch_and_add1_acquire(addr) # define AO_HAVE_int_fetch_and_add1_acquire_read #endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_int_fetch_and_add1_acquire_read) # define AO_int_fetch_and_add1_dd_acquire_read(addr) \ AO_int_fetch_and_add1_acquire_read(addr) # define AO_HAVE_int_fetch_and_add1_dd_acquire_read # endif #else # if defined(AO_HAVE_int_fetch_and_add1) # define AO_int_fetch_and_add1_dd_acquire_read(addr) \ AO_int_fetch_and_add1(addr) # define AO_HAVE_int_fetch_and_add1_dd_acquire_read # endif #endif /* int_fetch_and_sub1 */ #if defined(AO_HAVE_int_fetch_and_add_full) &&\ !defined(AO_HAVE_int_fetch_and_sub1_full) # define AO_int_fetch_and_sub1_full(addr) \ AO_int_fetch_and_add_full(addr,(unsigned int)(-1)) # define AO_HAVE_int_fetch_and_sub1_full #endif #if defined(AO_HAVE_int_fetch_and_add_release) &&\ !defined(AO_HAVE_int_fetch_and_sub1_release) # define AO_int_fetch_and_sub1_release(addr) \ AO_int_fetch_and_add_release(addr,(unsigned int)(-1)) # define AO_HAVE_int_fetch_and_sub1_release #endif #if defined(AO_HAVE_int_fetch_and_add_acquire) &&\ !defined(AO_HAVE_int_fetch_and_sub1_acquire) # define AO_int_fetch_and_sub1_acquire(addr) \ AO_int_fetch_and_add_acquire(addr,(unsigned int)(-1)) # define AO_HAVE_int_fetch_and_sub1_acquire #endif #if defined(AO_HAVE_int_fetch_and_add_write) &&\ !defined(AO_HAVE_int_fetch_and_sub1_write) # define AO_int_fetch_and_sub1_write(addr) \ AO_int_fetch_and_add_write(addr,(unsigned int)(-1)) # define AO_HAVE_int_fetch_and_sub1_write #endif #if defined(AO_HAVE_int_fetch_and_add_read) &&\ !defined(AO_HAVE_int_fetch_and_sub1_read) # define AO_int_fetch_and_sub1_read(addr) \ AO_int_fetch_and_add_read(addr,(unsigned int)(-1)) # define AO_HAVE_int_fetch_and_sub1_read #endif #if defined(AO_HAVE_int_fetch_and_add_release_write) &&\ !defined(AO_HAVE_int_fetch_and_sub1_release_write) # define AO_int_fetch_and_sub1_release_write(addr) \ AO_int_fetch_and_add_release_write(addr,(unsigned int)(-1)) # define AO_HAVE_int_fetch_and_sub1_release_write #endif #if defined(AO_HAVE_int_fetch_and_add_acquire_read) &&\ !defined(AO_HAVE_int_fetch_and_sub1_acquire_read) # define AO_int_fetch_and_sub1_acquire_read(addr) \ AO_int_fetch_and_add_acquire_read(addr,(unsigned int)(-1)) # define AO_HAVE_int_fetch_and_sub1_acquire_read #endif #if 
defined(AO_HAVE_int_fetch_and_add) &&\ !defined(AO_HAVE_int_fetch_and_sub1) # define AO_int_fetch_and_sub1(addr) \ AO_int_fetch_and_add(addr,(unsigned int)(-1)) # define AO_HAVE_int_fetch_and_sub1 #endif #if defined(AO_HAVE_int_fetch_and_sub1_full) # if !defined(AO_HAVE_int_fetch_and_sub1_release) # define AO_int_fetch_and_sub1_release(addr) \ AO_int_fetch_and_sub1_full(addr) # define AO_HAVE_int_fetch_and_sub1_release # endif # if !defined(AO_HAVE_int_fetch_and_sub1_acquire) # define AO_int_fetch_and_sub1_acquire(addr) \ AO_int_fetch_and_sub1_full(addr) # define AO_HAVE_int_fetch_and_sub1_acquire # endif # if !defined(AO_HAVE_int_fetch_and_sub1_write) # define AO_int_fetch_and_sub1_write(addr) \ AO_int_fetch_and_sub1_full(addr) # define AO_HAVE_int_fetch_and_sub1_write # endif # if !defined(AO_HAVE_int_fetch_and_sub1_read) # define AO_int_fetch_and_sub1_read(addr) \ AO_int_fetch_and_sub1_full(addr) # define AO_HAVE_int_fetch_and_sub1_read # endif #endif /* AO_HAVE_int_fetch_and_sub1_full */ #if !defined(AO_HAVE_int_fetch_and_sub1) && \ defined(AO_HAVE_int_fetch_and_sub1_release) # define AO_int_fetch_and_sub1(addr) \ AO_int_fetch_and_sub1_release(addr) # define AO_HAVE_int_fetch_and_sub1 #endif #if !defined(AO_HAVE_int_fetch_and_sub1) && \ defined(AO_HAVE_int_fetch_and_sub1_acquire) # define AO_int_fetch_and_sub1(addr) \ AO_int_fetch_and_sub1_acquire(addr) # define AO_HAVE_int_fetch_and_sub1 #endif #if !defined(AO_HAVE_int_fetch_and_sub1) && \ defined(AO_HAVE_int_fetch_and_sub1_write) # define AO_int_fetch_and_sub1(addr) \ AO_int_fetch_and_sub1_write(addr) # define AO_HAVE_int_fetch_and_sub1 #endif #if !defined(AO_HAVE_int_fetch_and_sub1) && \ defined(AO_HAVE_int_fetch_and_sub1_read) # define AO_int_fetch_and_sub1(addr) \ AO_int_fetch_and_sub1_read(addr) # define AO_HAVE_int_fetch_and_sub1 #endif #if defined(AO_HAVE_int_fetch_and_sub1_acquire) &&\ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_int_fetch_and_sub1_full) # define AO_int_fetch_and_sub1_full(addr) \ (AO_nop_full(), AO_int_fetch_and_sub1_acquire(addr)) # define AO_HAVE_int_fetch_and_sub1_full #endif #if !defined(AO_HAVE_int_fetch_and_sub1_release_write) && \ defined(AO_HAVE_int_fetch_and_sub1_write) # define AO_int_fetch_and_sub1_release_write(addr) \ AO_int_fetch_and_sub1_write(addr) # define AO_HAVE_int_fetch_and_sub1_release_write #endif #if !defined(AO_HAVE_int_fetch_and_sub1_release_write) && \ defined(AO_HAVE_int_fetch_and_sub1_release) # define AO_int_fetch_and_sub1_release_write(addr) \ AO_int_fetch_and_sub1_release(addr) # define AO_HAVE_int_fetch_and_sub1_release_write #endif #if !defined(AO_HAVE_int_fetch_and_sub1_acquire_read) && \ defined(AO_HAVE_int_fetch_and_sub1_read) # define AO_int_fetch_and_sub1_acquire_read(addr) \ AO_int_fetch_and_sub1_read(addr) # define AO_HAVE_int_fetch_and_sub1_acquire_read #endif #if !defined(AO_HAVE_int_fetch_and_sub1_acquire_read) && \ defined(AO_HAVE_int_fetch_and_sub1_acquire) # define AO_int_fetch_and_sub1_acquire_read(addr) \ AO_int_fetch_and_sub1_acquire(addr) # define AO_HAVE_int_fetch_and_sub1_acquire_read #endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_int_fetch_and_sub1_acquire_read) # define AO_int_fetch_and_sub1_dd_acquire_read(addr) \ AO_int_fetch_and_sub1_acquire_read(addr) # define AO_HAVE_int_fetch_and_sub1_dd_acquire_read # endif #else # if defined(AO_HAVE_int_fetch_and_sub1) # define AO_int_fetch_and_sub1_dd_acquire_read(addr) \ AO_int_fetch_and_sub1(addr) # define AO_HAVE_int_fetch_and_sub1_dd_acquire_read # endif #endif 
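The `..._fetch_and_sub1` variants above are all derived from fetch-and-add by adding `(unsigned int)(-1)`. A standalone sketch (illustration only, not part of the library) of why that is correct on unsigned types:

#include <stdio.h>

int main(void)
{
  /* On an N-bit unsigned type, adding (unsigned)(-1) is addition of */
  /* 2^N - 1 modulo 2^N, i.e. exactly a decrement by one.            */
  unsigned int x = 5;
  x += (unsigned int)(-1);
  printf("%u\n", x);   /* prints 4 */
  return 0;
}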
================================================
FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/generalize.h
================================================
/*
 * Copyright (c) 2003-2004 Hewlett-Packard Development Company, L.P.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * Generalize atomic operations for atomic_ops.h.
 * Should not be included directly.
 *
 * We make no attempt to define useless operations, such as
 *    AO_nop_acquire
 *    AO_nop_release
 *
 * We have also so far neglected to define some others, which
 * do not appear likely to be useful, e.g. stores with acquire
 * or read barriers.
 *
 * This file is sometimes included twice by atomic_ops.h.
 * All definitions include explicit checks that we are not replacing
 * an earlier definition.  In general, more desirable expansions
 * appear earlier so that we are more likely to use them.
 *
 * We only make safe generalizations, except that by default we define
 * the ...dd_acquire_read operations to be equivalent to those without
 * a barrier.  On platforms for which this is unsafe, the platform-specific
 * file must define AO_NO_DD_ORDERING.
 */

#ifndef ATOMIC_OPS_H
# error Atomic_ops_generalize.h should not be included directly.
#endif

#if AO_CHAR_TS_T
# define AO_TS_COMPARE_AND_SWAP_FULL(a,o,n) \
                AO_char_compare_and_swap_full(a,o,n)
# define AO_TS_COMPARE_AND_SWAP_ACQUIRE(a,o,n) \
                AO_char_compare_and_swap_acquire(a,o,n)
# define AO_TS_COMPARE_AND_SWAP_RELEASE(a,o,n) \
                AO_char_compare_and_swap_release(a,o,n)
# define AO_TS_COMPARE_AND_SWAP(a,o,n) \
                AO_char_compare_and_swap(a,o,n)
#endif

#if AO_AO_TS_T
# define AO_TS_COMPARE_AND_SWAP_FULL(a,o,n) \
                AO_compare_and_swap_full(a,o,n)
# define AO_TS_COMPARE_AND_SWAP_ACQUIRE(a,o,n) \
                AO_compare_and_swap_acquire(a,o,n)
# define AO_TS_COMPARE_AND_SWAP_RELEASE(a,o,n) \
                AO_compare_and_swap_release(a,o,n)
# define AO_TS_COMPARE_AND_SWAP(a,o,n) \
                AO_compare_and_swap(a,o,n)
#endif
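/* A minimal sketch (illustration only; the FOO_* names are hypothetical)  */
/* of the fallback scheme this file applies to every operation: a missing  */
/* variant is defined in terms of a stronger variant that already exists,  */
/* and a matching FOO_HAVE_* macro records that it is now available, so    */
/* later, less desirable expansions are skipped.                           */
#if 0
# if defined(FOO_HAVE_op_full) && !defined(FOO_HAVE_op_acquire)
#   define FOO_op_acquire(addr) FOO_op_full(addr) /* full implies acquire */
#   define FOO_HAVE_op_acquire
# endif
# if defined(FOO_HAVE_op_acquire) && !defined(FOO_HAVE_op)
#   define FOO_op(addr) FOO_op_acquire(addr) /* any barrier implies none */
#   define FOO_HAVE_op
# endif
#endif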
/* Generate test_and_set_full, if necessary and possible. */
#if !defined(AO_HAVE_test_and_set) && \
    !defined(AO_HAVE_test_and_set_release) && \
    !defined(AO_HAVE_test_and_set_acquire) && \
    !defined(AO_HAVE_test_and_set_read) && \
    !defined(AO_HAVE_test_and_set_full)
# if AO_AO_TS_T && defined(AO_HAVE_compare_and_swap_full) || \
     AO_CHAR_TS_T && defined(AO_HAVE_char_compare_and_swap_full)
   AO_INLINE AO_TS_VAL_t
   AO_test_and_set_full(volatile AO_TS_t *addr)
   {
     if (AO_TS_COMPARE_AND_SWAP_FULL(addr, AO_TS_CLEAR, AO_TS_SET))
       return AO_TS_CLEAR;
     else
       return AO_TS_SET;
   }
#  define AO_HAVE_test_and_set_full
# endif /* AO_HAVE_compare_and_swap_full */

# if AO_AO_TS_T && defined(AO_HAVE_compare_and_swap_acquire) || \
     AO_CHAR_TS_T && defined(AO_HAVE_char_compare_and_swap_acquire)
   AO_INLINE AO_TS_VAL_t
   AO_test_and_set_acquire(volatile AO_TS_t *addr)
   {
     if (AO_TS_COMPARE_AND_SWAP_ACQUIRE(addr, AO_TS_CLEAR, AO_TS_SET))
       return AO_TS_CLEAR;
     else
       return AO_TS_SET;
   }
#  define AO_HAVE_test_and_set_acquire
# endif /* AO_HAVE_compare_and_swap_acquire */

# if AO_AO_TS_T && defined(AO_HAVE_compare_and_swap_release) || \
     AO_CHAR_TS_T && defined(AO_HAVE_char_compare_and_swap_release)
   AO_INLINE AO_TS_VAL_t
   AO_test_and_set_release(volatile AO_TS_t *addr)
   {
     if (AO_TS_COMPARE_AND_SWAP_RELEASE(addr, AO_TS_CLEAR, AO_TS_SET))
       return AO_TS_CLEAR;
     else
       return AO_TS_SET;
   }
#  define AO_HAVE_test_and_set_release
# endif /* AO_HAVE_compare_and_swap_release */

# if AO_AO_TS_T && defined(AO_HAVE_compare_and_swap) || \
     AO_CHAR_TS_T && defined(AO_HAVE_char_compare_and_swap)
   AO_INLINE AO_TS_VAL_t
   AO_test_and_set(volatile AO_TS_t *addr)
   {
     if (AO_TS_COMPARE_AND_SWAP(addr, AO_TS_CLEAR, AO_TS_SET))
       return AO_TS_CLEAR;
     else
       return AO_TS_SET;
   }
#  define AO_HAVE_test_and_set
# endif /* AO_HAVE_compare_and_swap */

# if defined(AO_HAVE_test_and_set) && defined(AO_HAVE_nop_full) \
     && !defined(AO_HAVE_test_and_set_acquire)
   AO_INLINE AO_TS_VAL_t
   AO_test_and_set_acquire(volatile AO_TS_t *addr)
   {
     AO_TS_VAL_t result = AO_test_and_set(addr);
     AO_nop_full();
     return result;
   }
#  define AO_HAVE_test_and_set_acquire
# endif
#endif /* No prior test and set */

/* Nop */
#if !defined(AO_HAVE_nop)
  AO_INLINE void AO_nop(void) {}
# define AO_HAVE_nop
#endif

#if defined(AO_HAVE_test_and_set_full) && !defined(AO_HAVE_nop_full)
  AO_INLINE void
  AO_nop_full(void)
  {
    AO_TS_t dummy = AO_TS_INITIALIZER;
    AO_test_and_set_full(&dummy);
  }
# define AO_HAVE_nop_full
#endif

#if defined(AO_HAVE_nop_acquire)
# error AO_nop_acquire is useless: don't define.
#endif
#if defined(AO_HAVE_nop_release)
# error AO_nop_release is useless: don't define.
#endif
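/* Sketch (illustration only; the sketch_* name is hypothetical) of the    */
/* same trick in portable C11: where no dedicated full-fence primitive     */
/* exists, an atomic test-and-set on a dummy flag doubles as a full        */
/* barrier on common hardware, exactly as AO_nop_full() above.             */
#if 0
#include <stdatomic.h>
static inline void
sketch_nop_full(void)
{
  atomic_flag dummy = ATOMIC_FLAG_INIT;   /* local dummy, as in AO_nop_full */
  (void)atomic_flag_test_and_set(&dummy); /* seq_cst read-modify-write */
}
#endif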
#if defined(AO_HAVE_nop_full) && !defined(AO_HAVE_nop_read)
# define AO_nop_read() AO_nop_full()
# define AO_HAVE_nop_read
#endif

#if defined(AO_HAVE_nop_full) && !defined(AO_HAVE_nop_write)
# define AO_nop_write() AO_nop_full()
# define AO_HAVE_nop_write
#endif

/* Load */
#if defined(AO_HAVE_load_full) && !defined(AO_HAVE_load_acquire)
# define AO_load_acquire(addr) AO_load_full(addr)
# define AO_HAVE_load_acquire
#endif

#if defined(AO_HAVE_load_acquire) && !defined(AO_HAVE_load)
# define AO_load(addr) AO_load_acquire(addr)
# define AO_HAVE_load
#endif

#if defined(AO_HAVE_load_full) && !defined(AO_HAVE_load_read)
# define AO_load_read(addr) AO_load_full(addr)
# define AO_HAVE_load_read
#endif

#if !defined(AO_HAVE_load_acquire_read) && defined(AO_HAVE_load_acquire)
# define AO_load_acquire_read(addr) AO_load_acquire(addr)
# define AO_HAVE_load_acquire_read
#endif

#if defined(AO_HAVE_load) && defined(AO_HAVE_nop_full) && \
    !defined(AO_HAVE_load_acquire)
AO_INLINE AO_t
AO_load_acquire(const volatile AO_t *addr)
{
  AO_t result = AO_load(addr);
  /* Acquire barrier would be useless, since the load could be delayed */
  /* beyond it.                                                        */
  AO_nop_full();
  return result;
}
# define AO_HAVE_load_acquire
#endif

#if defined(AO_HAVE_load) && defined(AO_HAVE_nop_read) && \
    !defined(AO_HAVE_load_read)
AO_INLINE AO_t
AO_load_read(const volatile AO_t *addr)
{
  AO_t result = AO_load(addr);
  /* Acquire barrier would be useless, since the load could be delayed */
  /* beyond it.                                                        */
  AO_nop_read();
  return result;
}
# define AO_HAVE_load_read
#endif

#if defined(AO_HAVE_load_acquire) && defined(AO_HAVE_nop_full) && \
    !defined(AO_HAVE_load_full)
# define AO_load_full(addr) (AO_nop_full(), AO_load_acquire(addr))
# define AO_HAVE_load_full
#endif

#if !defined(AO_HAVE_load_acquire_read) && defined(AO_HAVE_load_read)
# define AO_load_acquire_read(addr) AO_load_read(addr)
# define AO_HAVE_load_acquire_read
#endif

#if defined(AO_HAVE_load_acquire_read) && !defined(AO_HAVE_load)
# define AO_load(addr) AO_load_acquire_read(addr)
# define AO_HAVE_load
#endif

#ifdef AO_NO_DD_ORDERING
# if defined(AO_HAVE_load_acquire_read)
#   define AO_load_dd_acquire_read(addr) AO_load_acquire_read(addr)
#   define AO_HAVE_load_dd_acquire_read
# endif
#else
# if defined(AO_HAVE_load)
#   define AO_load_dd_acquire_read(addr) AO_load(addr)
#   define AO_HAVE_load_dd_acquire_read
# endif
#endif

/* Store */
#if defined(AO_HAVE_store_full) && !defined(AO_HAVE_store_release)
# define AO_store_release(addr,val) AO_store_full(addr,val)
# define AO_HAVE_store_release
#endif

#if defined(AO_HAVE_store_release) && !defined(AO_HAVE_store)
# define AO_store(addr, val) AO_store_release(addr,val)
# define AO_HAVE_store
#endif

#if defined(AO_HAVE_store_full) && !defined(AO_HAVE_store_write)
# define AO_store_write(addr,val) AO_store_full(addr,val)
# define AO_HAVE_store_write
#endif

#if defined(AO_HAVE_store_release) && !defined(AO_HAVE_store_release_write)
# define AO_store_release_write(addr, val) AO_store_release(addr,val)
# define AO_HAVE_store_release_write
#endif

#if defined(AO_HAVE_store_write) && !defined(AO_HAVE_store)
# define AO_store(addr, val) AO_store_write(addr,val)
# define AO_HAVE_store
#endif

#if defined(AO_HAVE_store) && defined(AO_HAVE_nop_full) && \
    !defined(AO_HAVE_store_release)
# define AO_store_release(addr,val) (AO_nop_full(), AO_store(addr,val))
# define AO_HAVE_store_release
#endif

#if defined(AO_HAVE_nop_write) && defined(AO_HAVE_store) && \
    !defined(AO_HAVE_store_write)
# define AO_store_write(addr, val) (AO_nop_write(), \
AO_store(addr,val)) # define AO_HAVE_store_write #endif #if defined(AO_HAVE_store_write) && !defined(AO_HAVE_store_release_write) # define AO_store_release_write(addr, val) AO_store_write(addr,val) # define AO_HAVE_store_release_write #endif #if defined(AO_HAVE_store_release) && defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_store_full) # define AO_store_full(addr, val) (AO_store_release(addr, val), AO_nop_full()) # define AO_HAVE_store_full #endif /* NEC LE-IT: Test and set */ #if defined(AO_HAVE_test_and_set) && \ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_test_and_set_release) # define AO_test_and_set_release(addr) \ (AO_nop_full(), AO_test_and_set(addr)) # define AO_HAVE_test_and_set_release #endif #if defined(AO_HAVE_test_and_set) && \ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_test_and_set_acquire) AO_INLINE AO_TS_t AO_test_and_set_acquire(volatile AO_TS_t *addr) { AO_TS_t res = AO_test_and_set(addr); AO_nop_full(); return res; } # define AO_HAVE_test_and_set_acquire #endif /* Fetch_and_add */ /* We first try to implement fetch_and_add variants in terms */ /* of the corresponding compare_and_swap variants to minimize */ /* adding barriers. */ #if defined(AO_HAVE_compare_and_swap_full) && \ !defined(AO_HAVE_fetch_and_add_full) AO_INLINE AO_t AO_fetch_and_add_full(volatile AO_t *addr, AO_t incr) { AO_t old; do { old = *addr; } while (!AO_compare_and_swap_full(addr, old, old+incr)); return old; } # define AO_HAVE_fetch_and_add_full #endif #if defined(AO_HAVE_compare_and_swap_acquire) && \ !defined(AO_HAVE_fetch_and_add_acquire) AO_INLINE AO_t AO_fetch_and_add_acquire(volatile AO_t *addr, AO_t incr) { AO_t old; do { old = *addr; } while (!AO_compare_and_swap_acquire(addr, old, old+incr)); return old; } # define AO_HAVE_fetch_and_add_acquire #endif #if defined(AO_HAVE_compare_and_swap_release) && \ !defined(AO_HAVE_fetch_and_add_release) AO_INLINE AO_t AO_fetch_and_add_release(volatile AO_t *addr, AO_t incr) { AO_t old; do { old = *addr; } while (!AO_compare_and_swap_release(addr, old, old+incr)); return old; } # define AO_HAVE_fetch_and_add_release #endif #if defined(AO_HAVE_compare_and_swap) && \ !defined(AO_HAVE_fetch_and_add) AO_INLINE AO_t AO_fetch_and_add(volatile AO_t *addr, AO_t incr) { AO_t old; do { old = *addr; } while (!AO_compare_and_swap(addr, old, old+incr)); return old; } # define AO_HAVE_fetch_and_add #endif #if defined(AO_HAVE_fetch_and_add_full) # if !defined(AO_HAVE_fetch_and_add_release) # define AO_fetch_and_add_release(addr, val) \ AO_fetch_and_add_full(addr, val) # define AO_HAVE_fetch_and_add_release # endif # if !defined(AO_HAVE_fetch_and_add_acquire) # define AO_fetch_and_add_acquire(addr, val) \ AO_fetch_and_add_full(addr, val) # define AO_HAVE_fetch_and_add_acquire # endif # if !defined(AO_HAVE_fetch_and_add_write) # define AO_fetch_and_add_write(addr, val) \ AO_fetch_and_add_full(addr, val) # define AO_HAVE_fetch_and_add_write # endif # if !defined(AO_HAVE_fetch_and_add_read) # define AO_fetch_and_add_read(addr, val) \ AO_fetch_and_add_full(addr, val) # define AO_HAVE_fetch_and_add_read # endif #endif /* AO_HAVE_fetch_and_add_full */ #if !defined(AO_HAVE_fetch_and_add) && \ defined(AO_HAVE_fetch_and_add_release) # define AO_fetch_and_add(addr, val) \ AO_fetch_and_add_release(addr, val) # define AO_HAVE_fetch_and_add #endif #if !defined(AO_HAVE_fetch_and_add) && \ defined(AO_HAVE_fetch_and_add_acquire) # define AO_fetch_and_add(addr, val) \ AO_fetch_and_add_acquire(addr, val) # define AO_HAVE_fetch_and_add #endif #if !defined(AO_HAVE_fetch_and_add) 
&& \ defined(AO_HAVE_fetch_and_add_write) # define AO_fetch_and_add(addr, val) \ AO_fetch_and_add_write(addr, val) # define AO_HAVE_fetch_and_add #endif #if !defined(AO_HAVE_fetch_and_add) && \ defined(AO_HAVE_fetch_and_add_read) # define AO_fetch_and_add(addr, val) \ AO_fetch_and_add_read(addr, val) # define AO_HAVE_fetch_and_add #endif #if defined(AO_HAVE_fetch_and_add_acquire) &&\ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_fetch_and_add_full) # define AO_fetch_and_add_full(addr, val) \ (AO_nop_full(), AO_fetch_and_add_acquire(addr, val)) # define AO_HAVE_fetch_and_add_full #endif #if !defined(AO_HAVE_fetch_and_add_release_write) && \ defined(AO_HAVE_fetch_and_add_write) # define AO_fetch_and_add_release_write(addr, val) \ AO_fetch_and_add_write(addr, val) # define AO_HAVE_fetch_and_add_release_write #endif #if !defined(AO_HAVE_fetch_and_add_release_write) && \ defined(AO_HAVE_fetch_and_add_release) # define AO_fetch_and_add_release_write(addr, val) \ AO_fetch_and_add_release(addr, val) # define AO_HAVE_fetch_and_add_release_write #endif #if !defined(AO_HAVE_fetch_and_add_acquire_read) && \ defined(AO_HAVE_fetch_and_add_read) # define AO_fetch_and_add_acquire_read(addr, val) \ AO_fetch_and_add_read(addr, val) # define AO_HAVE_fetch_and_add_acquire_read #endif #if !defined(AO_HAVE_fetch_and_add_acquire_read) && \ defined(AO_HAVE_fetch_and_add_acquire) # define AO_fetch_and_add_acquire_read(addr, val) \ AO_fetch_and_add_acquire(addr, val) # define AO_HAVE_fetch_and_add_acquire_read #endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_fetch_and_add_acquire_read) # define AO_fetch_and_add_dd_acquire_read(addr, val) \ AO_fetch_and_add_acquire_read(addr, val) # define AO_HAVE_fetch_and_add_dd_acquire_read # endif #else # if defined(AO_HAVE_fetch_and_add) # define AO_fetch_and_add_dd_acquire_read(addr, val) \ AO_fetch_and_add(addr, val) # define AO_HAVE_fetch_and_add_dd_acquire_read # endif #endif /* Fetch_and_add1 */ #if defined(AO_HAVE_fetch_and_add_full) &&\ !defined(AO_HAVE_fetch_and_add1_full) # define AO_fetch_and_add1_full(addr) AO_fetch_and_add_full(addr,1) # define AO_HAVE_fetch_and_add1_full #endif #if defined(AO_HAVE_fetch_and_add_release) &&\ !defined(AO_HAVE_fetch_and_add1_release) # define AO_fetch_and_add1_release(addr) AO_fetch_and_add_release(addr,1) # define AO_HAVE_fetch_and_add1_release #endif #if defined(AO_HAVE_fetch_and_add_acquire) &&\ !defined(AO_HAVE_fetch_and_add1_acquire) # define AO_fetch_and_add1_acquire(addr) AO_fetch_and_add_acquire(addr,1) # define AO_HAVE_fetch_and_add1_acquire #endif #if defined(AO_HAVE_fetch_and_add_write) &&\ !defined(AO_HAVE_fetch_and_add1_write) # define AO_fetch_and_add1_write(addr) AO_fetch_and_add_write(addr,1) # define AO_HAVE_fetch_and_add1_write #endif #if defined(AO_HAVE_fetch_and_add_read) &&\ !defined(AO_HAVE_fetch_and_add1_read) # define AO_fetch_and_add1_read(addr) AO_fetch_and_add_read(addr,1) # define AO_HAVE_fetch_and_add1_read #endif #if defined(AO_HAVE_fetch_and_add_release_write) &&\ !defined(AO_HAVE_fetch_and_add1_release_write) # define AO_fetch_and_add1_release_write(addr) \ AO_fetch_and_add_release_write(addr,1) # define AO_HAVE_fetch_and_add1_release_write #endif #if defined(AO_HAVE_fetch_and_add_acquire_read) &&\ !defined(AO_HAVE_fetch_and_add1_acquire_read) # define AO_fetch_and_add1_acquire_read(addr) \ AO_fetch_and_add_acquire_read(addr,1) # define AO_HAVE_fetch_and_add1_acquire_read #endif #if defined(AO_HAVE_fetch_and_add) &&\ !defined(AO_HAVE_fetch_and_add1) # define AO_fetch_and_add1(addr) \ 
AO_fetch_and_add(addr,1) # define AO_HAVE_fetch_and_add1 #endif #if defined(AO_HAVE_fetch_and_add1_full) # if !defined(AO_HAVE_fetch_and_add1_release) # define AO_fetch_and_add1_release(addr) \ AO_fetch_and_add1_full(addr) # define AO_HAVE_fetch_and_add1_release # endif # if !defined(AO_HAVE_fetch_and_add1_acquire) # define AO_fetch_and_add1_acquire(addr) \ AO_fetch_and_add1_full(addr) # define AO_HAVE_fetch_and_add1_acquire # endif # if !defined(AO_HAVE_fetch_and_add1_write) # define AO_fetch_and_add1_write(addr) \ AO_fetch_and_add1_full(addr) # define AO_HAVE_fetch_and_add1_write # endif # if !defined(AO_HAVE_fetch_and_add1_read) # define AO_fetch_and_add1_read(addr) \ AO_fetch_and_add1_full(addr) # define AO_HAVE_fetch_and_add1_read # endif #endif /* AO_HAVE_fetch_and_add1_full */ #if !defined(AO_HAVE_fetch_and_add1) && \ defined(AO_HAVE_fetch_and_add1_release) # define AO_fetch_and_add1(addr) \ AO_fetch_and_add1_release(addr) # define AO_HAVE_fetch_and_add1 #endif #if !defined(AO_HAVE_fetch_and_add1) && \ defined(AO_HAVE_fetch_and_add1_acquire) # define AO_fetch_and_add1(addr) \ AO_fetch_and_add1_acquire(addr) # define AO_HAVE_fetch_and_add1 #endif #if !defined(AO_HAVE_fetch_and_add1) && \ defined(AO_HAVE_fetch_and_add1_write) # define AO_fetch_and_add1(addr) \ AO_fetch_and_add1_write(addr) # define AO_HAVE_fetch_and_add1 #endif #if !defined(AO_HAVE_fetch_and_add1) && \ defined(AO_HAVE_fetch_and_add1_read) # define AO_fetch_and_add1(addr) \ AO_fetch_and_add1_read(addr) # define AO_HAVE_fetch_and_add1 #endif #if defined(AO_HAVE_fetch_and_add1_acquire) &&\ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_fetch_and_add1_full) # define AO_fetch_and_add1_full(addr) \ (AO_nop_full(), AO_fetch_and_add1_acquire(addr)) # define AO_HAVE_fetch_and_add1_full #endif #if !defined(AO_HAVE_fetch_and_add1_release_write) && \ defined(AO_HAVE_fetch_and_add1_write) # define AO_fetch_and_add1_release_write(addr) \ AO_fetch_and_add1_write(addr) # define AO_HAVE_fetch_and_add1_release_write #endif #if !defined(AO_HAVE_fetch_and_add1_release_write) && \ defined(AO_HAVE_fetch_and_add1_release) # define AO_fetch_and_add1_release_write(addr) \ AO_fetch_and_add1_release(addr) # define AO_HAVE_fetch_and_add1_release_write #endif #if !defined(AO_HAVE_fetch_and_add1_acquire_read) && \ defined(AO_HAVE_fetch_and_add1_read) # define AO_fetch_and_add1_acquire_read(addr) \ AO_fetch_and_add1_read(addr) # define AO_HAVE_fetch_and_add1_acquire_read #endif #if !defined(AO_HAVE_fetch_and_add1_acquire_read) && \ defined(AO_HAVE_fetch_and_add1_acquire) # define AO_fetch_and_add1_acquire_read(addr) \ AO_fetch_and_add1_acquire(addr) # define AO_HAVE_fetch_and_add1_acquire_read #endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_fetch_and_add1_acquire_read) # define AO_fetch_and_add1_dd_acquire_read(addr) \ AO_fetch_and_add1_acquire_read(addr) # define AO_HAVE_fetch_and_add1_dd_acquire_read # endif #else # if defined(AO_HAVE_fetch_and_add1) # define AO_fetch_and_add1_dd_acquire_read(addr) AO_fetch_and_add1(addr) # define AO_HAVE_fetch_and_add1_dd_acquire_read # endif #endif /* Fetch_and_sub1 */ #if defined(AO_HAVE_fetch_and_add_full) &&\ !defined(AO_HAVE_fetch_and_sub1_full) # define AO_fetch_and_sub1_full(addr) AO_fetch_and_add_full(addr,(AO_t)(-1)) # define AO_HAVE_fetch_and_sub1_full #endif #if defined(AO_HAVE_fetch_and_add_release) &&\ !defined(AO_HAVE_fetch_and_sub1_release) # define AO_fetch_and_sub1_release(addr) \ AO_fetch_and_add_release(addr,(AO_t)(-1)) # define AO_HAVE_fetch_and_sub1_release #endif #if 
defined(AO_HAVE_fetch_and_add_acquire) &&\ !defined(AO_HAVE_fetch_and_sub1_acquire) # define AO_fetch_and_sub1_acquire(addr) \ AO_fetch_and_add_acquire(addr,(AO_t)(-1)) # define AO_HAVE_fetch_and_sub1_acquire #endif #if defined(AO_HAVE_fetch_and_add_write) &&\ !defined(AO_HAVE_fetch_and_sub1_write) # define AO_fetch_and_sub1_write(addr) \ AO_fetch_and_add_write(addr,(AO_t)(-1)) # define AO_HAVE_fetch_and_sub1_write #endif #if defined(AO_HAVE_fetch_and_add_read) &&\ !defined(AO_HAVE_fetch_and_sub1_read) # define AO_fetch_and_sub1_read(addr) \ AO_fetch_and_add_read(addr,(AO_t)(-1)) # define AO_HAVE_fetch_and_sub1_read #endif #if defined(AO_HAVE_fetch_and_add_release_write) &&\ !defined(AO_HAVE_fetch_and_sub1_release_write) # define AO_fetch_and_sub1_release_write(addr) \ AO_fetch_and_add_release_write(addr,(AO_t)(-1)) # define AO_HAVE_fetch_and_sub1_release_write #endif #if defined(AO_HAVE_fetch_and_add_acquire_read) &&\ !defined(AO_HAVE_fetch_and_sub1_acquire_read) # define AO_fetch_and_sub1_acquire_read(addr) \ AO_fetch_and_add_acquire_read(addr,(AO_t)(-1)) # define AO_HAVE_fetch_and_sub1_acquire_read #endif #if defined(AO_HAVE_fetch_and_add) &&\ !defined(AO_HAVE_fetch_and_sub1) # define AO_fetch_and_sub1(addr) \ AO_fetch_and_add(addr,(AO_t)(-1)) # define AO_HAVE_fetch_and_sub1 #endif #if defined(AO_HAVE_fetch_and_sub1_full) # if !defined(AO_HAVE_fetch_and_sub1_release) # define AO_fetch_and_sub1_release(addr) \ AO_fetch_and_sub1_full(addr) # define AO_HAVE_fetch_and_sub1_release # endif # if !defined(AO_HAVE_fetch_and_sub1_acquire) # define AO_fetch_and_sub1_acquire(addr) \ AO_fetch_and_sub1_full(addr) # define AO_HAVE_fetch_and_sub1_acquire # endif # if !defined(AO_HAVE_fetch_and_sub1_write) # define AO_fetch_and_sub1_write(addr) \ AO_fetch_and_sub1_full(addr) # define AO_HAVE_fetch_and_sub1_write # endif # if !defined(AO_HAVE_fetch_and_sub1_read) # define AO_fetch_and_sub1_read(addr) \ AO_fetch_and_sub1_full(addr) # define AO_HAVE_fetch_and_sub1_read # endif #endif /* AO_HAVE_fetch_and_sub1_full */ #if !defined(AO_HAVE_fetch_and_sub1) && \ defined(AO_HAVE_fetch_and_sub1_release) # define AO_fetch_and_sub1(addr) \ AO_fetch_and_sub1_release(addr) # define AO_HAVE_fetch_and_sub1 #endif #if !defined(AO_HAVE_fetch_and_sub1) && \ defined(AO_HAVE_fetch_and_sub1_acquire) # define AO_fetch_and_sub1(addr) \ AO_fetch_and_sub1_acquire(addr) # define AO_HAVE_fetch_and_sub1 #endif #if !defined(AO_HAVE_fetch_and_sub1) && \ defined(AO_HAVE_fetch_and_sub1_write) # define AO_fetch_and_sub1(addr) \ AO_fetch_and_sub1_write(addr) # define AO_HAVE_fetch_and_sub1 #endif #if !defined(AO_HAVE_fetch_and_sub1) && \ defined(AO_HAVE_fetch_and_sub1_read) # define AO_fetch_and_sub1(addr) \ AO_fetch_and_sub1_read(addr) # define AO_HAVE_fetch_and_sub1 #endif #if defined(AO_HAVE_fetch_and_sub1_acquire) &&\ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_fetch_and_sub1_full) # define AO_fetch_and_sub1_full(addr) \ (AO_nop_full(), AO_fetch_and_sub1_acquire(addr)) # define AO_HAVE_fetch_and_sub1_full #endif #if !defined(AO_HAVE_fetch_and_sub1_release_write) && \ defined(AO_HAVE_fetch_and_sub1_write) # define AO_fetch_and_sub1_release_write(addr) \ AO_fetch_and_sub1_write(addr) # define AO_HAVE_fetch_and_sub1_release_write #endif #if !defined(AO_HAVE_fetch_and_sub1_release_write) && \ defined(AO_HAVE_fetch_and_sub1_release) # define AO_fetch_and_sub1_release_write(addr) \ AO_fetch_and_sub1_release(addr) # define AO_HAVE_fetch_and_sub1_release_write #endif #if !defined(AO_HAVE_fetch_and_sub1_acquire_read) && \ 
defined(AO_HAVE_fetch_and_sub1_read) # define AO_fetch_and_sub1_acquire_read(addr) \ AO_fetch_and_sub1_read(addr) # define AO_HAVE_fetch_and_sub1_acquire_read #endif #if !defined(AO_HAVE_fetch_and_sub1_acquire_read) && \ defined(AO_HAVE_fetch_and_sub1_acquire) # define AO_fetch_and_sub1_acquire_read(addr) \ AO_fetch_and_sub1_acquire(addr) # define AO_HAVE_fetch_and_sub1_acquire_read #endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_fetch_and_sub1_acquire_read) # define AO_fetch_and_sub1_dd_acquire_read(addr) \ AO_fetch_and_sub1_acquire_read(addr) # define AO_HAVE_fetch_and_sub1_dd_acquire_read # endif #else # if defined(AO_HAVE_fetch_and_sub1) # define AO_fetch_and_sub1_dd_acquire_read(addr) AO_fetch_and_sub1(addr) # define AO_HAVE_fetch_and_sub1_dd_acquire_read # endif #endif /* Atomic or */ #if defined(AO_HAVE_compare_and_swap_full) && \ !defined(AO_HAVE_or_full) AO_INLINE void AO_or_full(volatile AO_t *addr, AO_t incr) { AO_t old; do { old = *addr; } while (!AO_compare_and_swap_full(addr, old, (old | incr))); } # define AO_HAVE_or_full #endif #if defined(AO_HAVE_or_full) # if !defined(AO_HAVE_or_release) # define AO_or_release(addr, val) \ AO_or_full(addr, val) # define AO_HAVE_or_release # endif # if !defined(AO_HAVE_or_acquire) # define AO_or_acquire(addr, val) \ AO_or_full(addr, val) # define AO_HAVE_or_acquire # endif # if !defined(AO_HAVE_or_write) # define AO_or_write(addr, val) \ AO_or_full(addr, val) # define AO_HAVE_or_write # endif # if !defined(AO_HAVE_or_read) # define AO_or_read(addr, val) \ AO_or_full(addr, val) # define AO_HAVE_or_read # endif #endif /* AO_HAVE_or_full */ #if !defined(AO_HAVE_or) && \ defined(AO_HAVE_or_release) # define AO_or(addr, val) \ AO_or_release(addr, val) # define AO_HAVE_or #endif #if !defined(AO_HAVE_or) && \ defined(AO_HAVE_or_acquire) # define AO_or(addr, val) \ AO_or_acquire(addr, val) # define AO_HAVE_or #endif #if !defined(AO_HAVE_or) && \ defined(AO_HAVE_or_write) # define AO_or(addr, val) \ AO_or_write(addr, val) # define AO_HAVE_or #endif #if !defined(AO_HAVE_or) && \ defined(AO_HAVE_or_read) # define AO_or(addr, val) \ AO_or_read(addr, val) # define AO_HAVE_or #endif #if defined(AO_HAVE_or_acquire) &&\ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_or_full) # define AO_or_full(addr, val) \ (AO_nop_full(), AO_or_acquire(addr, val)) #endif #if !defined(AO_HAVE_or_release_write) && \ defined(AO_HAVE_or_write) # define AO_or_release_write(addr, val) \ AO_or_write(addr, val) # define AO_HAVE_or_release_write #endif #if !defined(AO_HAVE_or_release_write) && \ defined(AO_HAVE_or_release) # define AO_or_release_write(addr, val) \ AO_or_release(addr, val) # define AO_HAVE_or_release_write #endif #if !defined(AO_HAVE_or_acquire_read) && \ defined(AO_HAVE_or_read) # define AO_or_acquire_read(addr, val) \ AO_or_read(addr, val) # define AO_HAVE_or_acquire_read #endif #if !defined(AO_HAVE_or_acquire_read) && \ defined(AO_HAVE_or_acquire) # define AO_or_acquire_read(addr, val) \ AO_or_acquire(addr, val) # define AO_HAVE_or_acquire_read #endif /* dd_aquire_read is meaningless. 
*/ /* Test_and_set */ #if defined(AO_HAVE_test_and_set_full) # if !defined(AO_HAVE_test_and_set_release) # define AO_test_and_set_release(addr) \ AO_test_and_set_full(addr) # define AO_HAVE_test_and_set_release # endif # if !defined(AO_HAVE_test_and_set_acquire) # define AO_test_and_set_acquire(addr) \ AO_test_and_set_full(addr) # define AO_HAVE_test_and_set_acquire # endif # if !defined(AO_HAVE_test_and_set_write) # define AO_test_and_set_write(addr) \ AO_test_and_set_full(addr) # define AO_HAVE_test_and_set_write # endif # if !defined(AO_HAVE_test_and_set_read) # define AO_test_and_set_read(addr) \ AO_test_and_set_full(addr) # define AO_HAVE_test_and_set_read # endif #endif /* AO_HAVE_test_and_set_full */ #if !defined(AO_HAVE_test_and_set) && \ defined(AO_HAVE_test_and_set_release) # define AO_test_and_set(addr) \ AO_test_and_set_release(addr) # define AO_HAVE_test_and_set #endif #if !defined(AO_HAVE_test_and_set) && \ defined(AO_HAVE_test_and_set_acquire) # define AO_test_and_set(addr) \ AO_test_and_set_acquire(addr) # define AO_HAVE_test_and_set #endif #if !defined(AO_HAVE_test_and_set) && \ defined(AO_HAVE_test_and_set_write) # define AO_test_and_set(addr) \ AO_test_and_set_write(addr) # define AO_HAVE_test_and_set #endif #if !defined(AO_HAVE_test_and_set) && \ defined(AO_HAVE_test_and_set_read) # define AO_test_and_set(addr) \ AO_test_and_set_read(addr) # define AO_HAVE_test_and_set #endif #if defined(AO_HAVE_test_and_set_acquire) &&\ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_test_and_set_full) # define AO_test_and_set_full(addr) \ (AO_nop_full(), AO_test_and_set_acquire(addr)) # define AO_HAVE_test_and_set_full #endif #if !defined(AO_HAVE_test_and_set_release_write) && \ defined(AO_HAVE_test_and_set_write) # define AO_test_and_set_release_write(addr) \ AO_test_and_set_write(addr) # define AO_HAVE_test_and_set_release_write #endif #if !defined(AO_HAVE_test_and_set_release_write) && \ defined(AO_HAVE_test_and_set_release) # define AO_test_and_set_release_write(addr) \ AO_test_and_set_release(addr) # define AO_HAVE_test_and_set_release_write #endif #if !defined(AO_HAVE_test_and_set_acquire_read) && \ defined(AO_HAVE_test_and_set_read) # define AO_test_and_set_acquire_read(addr) \ AO_test_and_set_read(addr) # define AO_HAVE_test_and_set_acquire_read #endif #if !defined(AO_HAVE_test_and_set_acquire_read) && \ defined(AO_HAVE_test_and_set_acquire) # define AO_test_and_set_acquire_read(addr) \ AO_test_and_set_acquire(addr) # define AO_HAVE_test_and_set_acquire_read #endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_test_and_set_acquire_read) # define AO_test_and_set_dd_acquire_read(addr) \ AO_test_and_set_acquire_read(addr) # define AO_HAVE_test_and_set_dd_acquire_read # endif #else # if defined(AO_HAVE_test_and_set) # define AO_test_and_set_dd_acquire_read(addr) AO_test_and_set(addr) # define AO_HAVE_test_and_set_dd_acquire_read # endif #endif /* Compare_and_swap */ #if defined(AO_HAVE_compare_and_swap) && defined(AO_HAVE_nop_full)\ && !defined(AO_HAVE_compare_and_swap_acquire) AO_INLINE int AO_compare_and_swap_acquire(volatile AO_t *addr, AO_t old, AO_t new_val) { int result = AO_compare_and_swap(addr, old, new_val); AO_nop_full(); return result; } # define AO_HAVE_compare_and_swap_acquire #endif #if defined(AO_HAVE_compare_and_swap) && defined(AO_HAVE_nop_full)\ && !defined(AO_HAVE_compare_and_swap_release) # define AO_compare_and_swap_release(addr, old, new_val) \ (AO_nop_full(), AO_compare_and_swap(addr, old, new_val)) # define AO_HAVE_compare_and_swap_release #endif #if 
defined(AO_HAVE_compare_and_swap_full) # if !defined(AO_HAVE_compare_and_swap_release) # define AO_compare_and_swap_release(addr, old, new_val) \ AO_compare_and_swap_full(addr, old, new_val) # define AO_HAVE_compare_and_swap_release # endif # if !defined(AO_HAVE_compare_and_swap_acquire) # define AO_compare_and_swap_acquire(addr, old, new_val) \ AO_compare_and_swap_full(addr, old, new_val) # define AO_HAVE_compare_and_swap_acquire # endif # if !defined(AO_HAVE_compare_and_swap_write) # define AO_compare_and_swap_write(addr, old, new_val) \ AO_compare_and_swap_full(addr, old, new_val) # define AO_HAVE_compare_and_swap_write # endif # if !defined(AO_HAVE_compare_and_swap_read) # define AO_compare_and_swap_read(addr, old, new_val) \ AO_compare_and_swap_full(addr, old, new_val) # define AO_HAVE_compare_and_swap_read # endif #endif /* AO_HAVE_compare_and_swap_full */ #if !defined(AO_HAVE_compare_and_swap) && \ defined(AO_HAVE_compare_and_swap_release) # define AO_compare_and_swap(addr, old, new_val) \ AO_compare_and_swap_release(addr, old, new_val) # define AO_HAVE_compare_and_swap #endif #if !defined(AO_HAVE_compare_and_swap) && \ defined(AO_HAVE_compare_and_swap_acquire) # define AO_compare_and_swap(addr, old, new_val) \ AO_compare_and_swap_acquire(addr, old, new_val) # define AO_HAVE_compare_and_swap #endif #if !defined(AO_HAVE_compare_and_swap) && \ defined(AO_HAVE_compare_and_swap_write) # define AO_compare_and_swap(addr, old, new_val) \ AO_compare_and_swap_write(addr, old, new_val) # define AO_HAVE_compare_and_swap #endif #if !defined(AO_HAVE_compare_and_swap) && \ defined(AO_HAVE_compare_and_swap_read) # define AO_compare_and_swap(addr, old, new_val) \ AO_compare_and_swap_read(addr, old, new_val) # define AO_HAVE_compare_and_swap #endif #if defined(AO_HAVE_compare_and_swap_acquire) &&\ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_compare_and_swap_full) # define AO_compare_and_swap_full(addr, old, new_val) \ (AO_nop_full(), AO_compare_and_swap_acquire(addr, old, new_val)) # define AO_HAVE_compare_and_swap_full #endif #if !defined(AO_HAVE_compare_and_swap_release_write) && \ defined(AO_HAVE_compare_and_swap_write) # define AO_compare_and_swap_release_write(addr, old, new_val) \ AO_compare_and_swap_write(addr, old, new_val) # define AO_HAVE_compare_and_swap_release_write #endif #if !defined(AO_HAVE_compare_and_swap_release_write) && \ defined(AO_HAVE_compare_and_swap_release) # define AO_compare_and_swap_release_write(addr, old, new_val) \ AO_compare_and_swap_release(addr, old, new_val) # define AO_HAVE_compare_and_swap_release_write #endif #if !defined(AO_HAVE_compare_and_swap_acquire_read) && \ defined(AO_HAVE_compare_and_swap_read) # define AO_compare_and_swap_acquire_read(addr, old, new_val) \ AO_compare_and_swap_read(addr, old, new_val) # define AO_HAVE_compare_and_swap_acquire_read #endif #if !defined(AO_HAVE_compare_and_swap_acquire_read) && \ defined(AO_HAVE_compare_and_swap_acquire) # define AO_compare_and_swap_acquire_read(addr, old, new_val) \ AO_compare_and_swap_acquire(addr, old, new_val) # define AO_HAVE_compare_and_swap_acquire_read #endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_compare_and_swap_acquire_read) # define AO_compare_and_swap_dd_acquire_read(addr, old, new_val) \ AO_compare_and_swap_acquire_read(addr, old, new_val) # define AO_HAVE_compare_and_swap_dd_acquire_read # endif #else # if defined(AO_HAVE_compare_and_swap) # define AO_compare_and_swap_dd_acquire_read(addr, old, new_val) \ AO_compare_and_swap(addr, old, new_val) # define 
AO_HAVE_compare_and_swap_dd_acquire_read # endif #endif #include "generalize-small.h" /* Compare_double_and_swap_double */ #if defined(AO_HAVE_compare_double_and_swap_double) && defined(AO_HAVE_nop_full)\ && !defined(AO_HAVE_compare_double_and_swap_double_acquire) AO_INLINE int AO_compare_double_and_swap_double_acquire(volatile AO_double_t *addr, AO_t o1, AO_t o2, AO_t n1, AO_t n2) { int result = AO_compare_double_and_swap_double(addr, o1, o2, n1, n2); AO_nop_full(); return result; } # define AO_HAVE_compare_double_and_swap_double_acquire #endif #if defined(AO_HAVE_compare_double_and_swap_double) \ && defined(AO_HAVE_nop_full)\ && !defined(AO_HAVE_compare_double_and_swap_double_release) # define AO_compare_double_and_swap_double_release(addr, o1, o2, n1, n2) \ (AO_nop_full(), AO_compare_double_and_swap_double(addr, o1, o2, n1, n2)) # define AO_HAVE_compare_double_and_swap_double_release #endif #if defined(AO_HAVE_compare_double_and_swap_double_full) # if !defined(AO_HAVE_compare_double_and_swap_double_release) # define AO_compare_double_and_swap_double_release(addr, o1, o2, n1, n2) \ AO_compare_double_and_swap_double_full(addr, o1, o2, n1, n2) # define AO_HAVE_compare_double_and_swap_double_release # endif # if !defined(AO_HAVE_compare_double_and_swap_double_acquire) # define AO_compare_double_and_swap_double_acquire(addr, o1, o2, n1, n2) \ AO_compare_double_and_swap_double_full(addr, o1, o2, n1, n2) # define AO_HAVE_compare_double_and_swap_double_acquire # endif # if !defined(AO_HAVE_compare_double_and_swap_double_write) # define AO_compare_double_and_swap_double_write(addr, o1, o2, n1, n2) \ AO_compare_double_and_swap_double_full(addr, o1, o2, n1, n2) # define AO_HAVE_compare_double_and_swap_double_write # endif # if !defined(AO_HAVE_compare_double_and_swap_double_read) # define AO_compare_double_and_swap_double_read(addr, o1, o2, n1, n2) \ AO_compare_double_and_swap_double_full(addr, o1, o2, n1, n2) # define AO_HAVE_compare_double_and_swap_double_read # endif #endif /* AO_HAVE_compare_double_and_swap_double_full */ #if !defined(AO_HAVE_compare_double_and_swap_double) && \ defined(AO_HAVE_compare_double_and_swap_double_release) # define AO_compare_double_and_swap_double(addr, o1, o2, n1, n2) \ AO_compare_double_and_swap_double_release(addr, o1, o2, n1, n2) # define AO_HAVE_compare_double_and_swap_double #endif #if !defined(AO_HAVE_compare_double_and_swap_double) && \ defined(AO_HAVE_compare_double_and_swap_double_acquire) # define AO_compare_double_and_swap_double(addr, o1, o2, n1, n2) \ AO_compare_double_and_swap_double_acquire(addr, o1, o2, n1, n2) # define AO_HAVE_compare_double_and_swap_double #endif #if !defined(AO_HAVE_compare_double_and_swap_double) && \ defined(AO_HAVE_compare_double_and_swap_double_write) # define AO_compare_double_and_swap_double(addr, o1, o2, n1, n2) \ AO_compare_double_and_swap_double_write(addr, o1, o2, n1, n2) # define AO_HAVE_compare_double_and_swap_double #endif #if !defined(AO_HAVE_compare_double_and_swap_double) && \ defined(AO_HAVE_compare_double_and_swap_double_read) # define AO_compare_double_and_swap_double(addr, o1, o2, n1, n2) \ AO_compare_double_and_swap_double_read(addr, o1, o2, n1, n2) # define AO_HAVE_compare_double_and_swap_double #endif #if defined(AO_HAVE_compare_double_and_swap_double_acquire) &&\ defined(AO_HAVE_nop_full) && \ !defined(AO_HAVE_compare_double_and_swap_double_full) # define AO_compare_double_and_swap_double_full(addr, o1, o2, n1, n2) \ (AO_nop_full(), AO_compare_double_and_swap_double_acquire(addr, o1, o2, n1, n2)) # define 
AO_HAVE_compare_double_and_swap_double_full #endif #if !defined(AO_HAVE_compare_double_and_swap_double_release_write) && \ defined(AO_HAVE_compare_double_and_swap_double_write) # define AO_compare_double_and_swap_double_release_write(addr, o1, o2, n1, n2) \ AO_compare_double_and_swap_double_write(addr, o1, o2, n1, n2) # define AO_HAVE_compare_double_and_swap_double_release_write #endif #if !defined(AO_HAVE_compare_double_and_swap_double_release_write) && \ defined(AO_HAVE_compare_double_and_swap_double_release) # define AO_compare_double_and_swap_double_release_write(addr, o1, o2, n1, n2) \ AO_compare_double_and_swap_double_release(addr, o1, o2, n1, n2) # define AO_HAVE_compare_double_and_swap_double_release_write #endif #if !defined(AO_HAVE_compare_double_and_swap_double_acquire_read) && \ defined(AO_HAVE_compare_double_and_swap_double_read) # define AO_compare_double_and_swap_double_acquire_read(addr, o1, o2, n1, n2) \ AO_compare_double_and_swap_double_read(addr, o1, o2, n1, n2) # define AO_HAVE_compare_double_and_swap_double_acquire_read #endif #if !defined(AO_HAVE_compare_double_and_swap_double_acquire_read) && \ defined(AO_HAVE_compare_double_and_swap_double_acquire) # define AO_compare_double_and_swap_double_acquire_read(addr, o1, o2, n1, n2) \ AO_compare_double_and_swap_double_acquire(addr, o1, o2, n1, n2) # define AO_HAVE_compare_double_and_swap_double_acquire_read #endif #ifdef AO_NO_DD_ORDERING # if defined(AO_HAVE_compare_double_and_swap_double_acquire_read) # define AO_compare_double_and_swap_double_dd_acquire_read(addr, o1, o2, n1, n2) \ AO_compare_double_and_swap_double_acquire_read(addr, o1, o2, n1, n2) # define AO_HAVE_compare_double_and_swap_double_dd_acquire_read # endif #else # if defined(AO_HAVE_compare_double_and_swap_double) # define AO_compare_double_and_swap_double_dd_acquire_read(addr, o1, o2, n1, n2) \ AO_compare_double_and_swap_double(addr, o1, o2, n1, n2) # define AO_HAVE_compare_double_and_swap_double_dd_acquire_read # endif #endif /* Compare_and_swap_double */ #if defined(AO_HAVE_compare_and_swap_double) && defined(AO_HAVE_nop_full)\ && !defined(AO_HAVE_compare_and_swap_double_acquire) AO_INLINE int AO_compare_and_swap_double_acquire(volatile AO_double_t *addr, AO_t o1, AO_t n1, AO_t n2) { int result = AO_compare_and_swap_double(addr, o1, n1, n2); AO_nop_full(); return result; } # define AO_HAVE_compare_and_swap_double_acquire #endif #if defined(AO_HAVE_compare_and_swap_double) \ && defined(AO_HAVE_nop_full)\ && !defined(AO_HAVE_compare_and_swap_double_release) # define AO_compare_and_swap_double_release(addr, o1, n1, n2) \ (AO_nop_full(), AO_compare_and_swap_double(addr, o1, n1, n2)) # define AO_HAVE_compare_and_swap_double_release #endif #if defined(AO_HAVE_compare_and_swap_double_full) # if !defined(AO_HAVE_compare_and_swap_double_release) # define AO_compare_and_swap_double_release(addr, o1, n1, n2) \ AO_compare_and_swap_double_full(addr, o1, n1, n2) # define AO_HAVE_compare_and_swap_double_release # endif # if !defined(AO_HAVE_compare_and_swap_double_acquire) # define AO_compare_and_swap_double_acquire(addr, o1, n1, n2) \ AO_compare_and_swap_double_full(addr, o1, n1, n2) # define AO_HAVE_compare_and_swap_double_acquire # endif # if !defined(AO_HAVE_compare_and_swap_double_write) # define AO_compare_and_swap_double_write(addr, o1, n1, n2) \ AO_compare_and_swap_double_full(addr, o1, n1, n2) # define AO_HAVE_compare_and_swap_double_write # endif # if !defined(AO_HAVE_compare_and_swap_double_read) # define AO_compare_and_swap_double_read(addr, o1, n1, n2) \ 
                AO_compare_and_swap_double_full(addr, o1, n1, n2)
#  define AO_HAVE_compare_and_swap_double_read
# endif
#endif /* AO_HAVE_compare_and_swap_double_full */

#if !defined(AO_HAVE_compare_and_swap_double) && \
    defined(AO_HAVE_compare_and_swap_double_release)
# define AO_compare_and_swap_double(addr, o1, n1, n2) \
                AO_compare_and_swap_double_release(addr, o1, n1, n2)
# define AO_HAVE_compare_and_swap_double
#endif
#if !defined(AO_HAVE_compare_and_swap_double) && \
    defined(AO_HAVE_compare_and_swap_double_acquire)
# define AO_compare_and_swap_double(addr, o1, n1, n2) \
                AO_compare_and_swap_double_acquire(addr, o1, n1, n2)
# define AO_HAVE_compare_and_swap_double
#endif
#if !defined(AO_HAVE_compare_and_swap_double) && \
    defined(AO_HAVE_compare_and_swap_double_write)
# define AO_compare_and_swap_double(addr, o1, n1, n2) \
                AO_compare_and_swap_double_write(addr, o1, n1, n2)
# define AO_HAVE_compare_and_swap_double
#endif
#if !defined(AO_HAVE_compare_and_swap_double) && \
    defined(AO_HAVE_compare_and_swap_double_read)
# define AO_compare_and_swap_double(addr, o1, n1, n2) \
                AO_compare_and_swap_double_read(addr, o1, n1, n2)
# define AO_HAVE_compare_and_swap_double
#endif

#if defined(AO_HAVE_compare_and_swap_double_acquire) && \
    defined(AO_HAVE_nop_full) && \
    !defined(AO_HAVE_compare_and_swap_double_full)
# define AO_compare_and_swap_double_full(addr, o1, n1, n2) \
                (AO_nop_full(), \
                 AO_compare_and_swap_double_acquire(addr, o1, n1, n2))
# define AO_HAVE_compare_and_swap_double_full
#endif

#if !defined(AO_HAVE_compare_and_swap_double_release_write) && \
    defined(AO_HAVE_compare_and_swap_double_write)
# define AO_compare_and_swap_double_release_write(addr, o1, n1, n2) \
                AO_compare_and_swap_double_write(addr, o1, n1, n2)
# define AO_HAVE_compare_and_swap_double_release_write
#endif
#if !defined(AO_HAVE_compare_and_swap_double_release_write) && \
    defined(AO_HAVE_compare_and_swap_double_release)
# define AO_compare_and_swap_double_release_write(addr, o1, n1, n2) \
                AO_compare_and_swap_double_release(addr, o1, n1, n2)
# define AO_HAVE_compare_and_swap_double_release_write
#endif
#if !defined(AO_HAVE_compare_and_swap_double_acquire_read) && \
    defined(AO_HAVE_compare_and_swap_double_read)
# define AO_compare_and_swap_double_acquire_read(addr, o1, n1, n2) \
                AO_compare_and_swap_double_read(addr, o1, n1, n2)
# define AO_HAVE_compare_and_swap_double_acquire_read
#endif
#if !defined(AO_HAVE_compare_and_swap_double_acquire_read) && \
    defined(AO_HAVE_compare_and_swap_double_acquire)
# define AO_compare_and_swap_double_acquire_read(addr, o1, n1, n2) \
                AO_compare_and_swap_double_acquire(addr, o1, n1, n2)
# define AO_HAVE_compare_and_swap_double_acquire_read
#endif

#ifdef AO_NO_DD_ORDERING
# if defined(AO_HAVE_compare_and_swap_double_acquire_read)
#   define AO_compare_and_swap_double_dd_acquire_read(addr, o1, n1, n2) \
                AO_compare_and_swap_double_acquire_read(addr, o1, n1, n2)
#   define AO_HAVE_compare_and_swap_double_dd_acquire_read
# endif
#else
# if defined(AO_HAVE_compare_and_swap_double)
#   define AO_compare_and_swap_double_dd_acquire_read(addr, o1, n1, n2) \
                AO_compare_and_swap_double(addr, o1, n1, n2)
#   define AO_HAVE_compare_and_swap_double_dd_acquire_read
# endif
#endif
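/* Sketch (illustration only; the sketch_* names are hypothetical) of the  */
/* classic use of a double-width CAS: pairing a pointer with a version     */
/* counter so that ABA-style reuse of the pointer is detected.  A rough    */
/* C11 equivalent, which may be implemented with a lock unless the target  */
/* supports a native two-word CAS:                                         */
#if 0
#include <stdatomic.h>
#include <stdint.h>
typedef struct { uintptr_t ptr; uintptr_t version; } sketch_pair_t;
static _Atomic sketch_pair_t sketch_top;
static inline int
sketch_update(sketch_pair_t expected, uintptr_t new_ptr)
{
  /* Bumping the counter on every update makes a recycled pointer value */
  /* fail the comparison, which a single-word CAS could not detect.     */
  sketch_pair_t desired = { new_ptr, expected.version + 1 };
  return atomic_compare_exchange_strong(&sketch_top, &expected, desired);
}
#endif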
AO_compare_double_and_swap_double_release(addr, old_val.AO_val1, old_val.AO_val2, new_val.AO_val1, new_val.AO_val2); } #define AO_HAVE_double_compare_and_swap_release #endif #if defined(AO_HAVE_compare_double_and_swap_double_acquire) && \ !defined(AO_HAVE_double_compare_and_swap_acquire) AO_INLINE int AO_double_compare_and_swap_acquire(volatile AO_double_t *addr, AO_double_t old_val, AO_double_t new_val) { return AO_compare_double_and_swap_double_acquire(addr, old_val.AO_val1, old_val.AO_val2, new_val.AO_val1, new_val.AO_val2); } #define AO_HAVE_double_compare_and_swap_acquire #endif #if defined(AO_HAVE_compare_double_and_swap_double_full) && \ !defined(AO_HAVE_double_compare_and_swap_full) AO_INLINE int AO_double_compare_and_swap_full(volatile AO_double_t *addr, AO_double_t old_val, AO_double_t new_val) { return AO_compare_double_and_swap_double_full(addr, old_val.AO_val1, old_val.AO_val2, new_val.AO_val1, new_val.AO_val2); } #define AO_HAVE_double_compare_and_swap_full #endif ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/README ================================================ There are two kinds of entities in this directory: - Subdirectories corresponding to specific compilers (or compiler/OS combinations). Each of these includes one or more architecture-specific headers. - More generic header files corresponding to a particular ordering and/or atomicity property that might be shared by multiple hardware platforms. ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/acquire_release_volatile.h ================================================ /* * Copyright (c) 2003 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * This file adds definitions appropriate for environments in which an AO_t * volatile load has acquire semantics, and an AO_t volatile store has release * semantics. This is arguably supposed to be true with the standard Itanium * software conventions. */ /* * Empirically gcc/ia64 does some reordering of ordinary operations around volatiles * even when we think it shouldn't. Gcc 3.3 and earlier could reorder a volatile store * with another store. As of March 2005, gcc pre-4 reused previously computed * common subexpressions across a volatile load. * Hence we now add compiler barriers for gcc. 
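 *
 * [Editorial aside, added by the editor; not part of the original
 * libatomic_ops source. The acquire load / release store pair defined just
 * below is what makes the classic message-passing idiom safe. The sketch
 * that follows shows that idiom in standard C11 <stdatomic.h> rather than
 * the AO_ API, so it is self-contained; all names in it (payload, ready,
 * producer, consumer) are illustrative only.]
 */
#if 0 /* editorial illustration only; not compiled as part of this header */
#include <stdatomic.h>

static int payload;             /* plain data being published */
static atomic_int ready;        /* guard flag, zero-initialized */

static void producer(void)
{
    payload = 42;                                           /* 1: write data */
    atomic_store_explicit(&ready, 1, memory_order_release); /* 2: publish   */
}

static void consumer(void)
{
    if (atomic_load_explicit(&ready, memory_order_acquire)) /* 3: observe   */
        (void)payload;       /* 4: guaranteed to read 42, not a stale value */
}
#endif
/*
 * [End of editorial aside; the original header comment closes below.]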
*/ #if !defined(AO_GCC_BARRIER) # if defined(__GNUC__) # define AO_GCC_BARRIER() AO_compiler_barrier() # else # define AO_GCC_BARRIER() # endif #endif AO_INLINE AO_t AO_load_acquire(const volatile AO_t *p) { AO_t result = *p; /* A normal volatile load generates an ld.acq */ AO_GCC_BARRIER(); return result; } #define AO_HAVE_load_acquire AO_INLINE void AO_store_release(volatile AO_t *p, AO_t val) { AO_GCC_BARRIER(); /* A normal volatile store generates an st.rel */ *p = val; } #define AO_HAVE_store_release ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/aligned_atomic_load_store.h ================================================ /* * Copyright (c) 2003 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * Definitions for architectures on which loads and stores of AO_t are * atomic for all legal alignments. */ AO_INLINE AO_t AO_load(const volatile AO_t *addr) { assert(((size_t)addr & (sizeof(AO_t) - 1)) == 0); /* Cast away the volatile for architectures where */ /* volatile adds barrier semantics. */ return *(AO_t *)addr; } #define AO_HAVE_load AO_INLINE void AO_store(volatile AO_t *addr, AO_t new_val) { assert(((size_t)addr & (sizeof(AO_t) - 1)) == 0); (*(AO_t *)addr) = new_val; } #define AO_HAVE_store ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/all_acquire_release_volatile.h ================================================ /* * Copyright (c) 2004 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * Describes architectures on which volatile AO_t, unsigned char, unsigned * short, and unsigned int loads and stores have acquire/release semantics for * all normally legal alignments. */ #include "acquire_release_volatile.h" #include "char_acquire_release_volatile.h" #include "short_acquire_release_volatile.h" #include "int_acquire_release_volatile.h" ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/all_aligned_atomic_load_store.h ================================================ /* * Copyright (c) 2004 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * Describes architectures on which AO_t, unsigned char, unsigned short, * and unsigned int loads and stores are atomic for all normally legal * alignments. */ #include "aligned_atomic_load_store.h" #include "char_atomic_load_store.h" #include "short_aligned_atomic_load_store.h" #include "int_aligned_atomic_load_store.h" ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/all_atomic_load_store.h ================================================ /* * Copyright (c) 2004 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * Describes architectures on which AO_t, unsigned char, unsigned short, * and unsigned int loads and stores are atomic for all normally legal * alignments. */ #include "atomic_load_store.h" #include "char_atomic_load_store.h" #include "short_atomic_load_store.h" #include "int_atomic_load_store.h" ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/ao_t_is_int.h ================================================ /* * Copyright (c) 2003-2004 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * Inclusion of this file signifies that AO_t is in fact int. Hence * any AO_... operations can also serve as AO_int_... operations. * We currently define only the more important ones here, and allow for * the normal generalization process to define the others. * We should probably add others in the future.
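 *
 * [Editorial aside, added by the editor; not part of the original
 * libatomic_ops source. The macros below reuse the word-sized primitive for
 * int by casting the int address to AO_t*, which is valid precisely because
 * AO_t is int here. A self-contained C11 rendering of the same wrapping
 * idea, with values cast instead of pointers; word_cas and int_cas are
 * illustrative names, not part of any API:]
 */
#if 0 /* editorial illustration only; not compiled as part of this header */
#include <stdatomic.h>
#include <stdbool.h>

/* word-sized CAS primitive, standing in for AO_compare_and_swap */
static bool word_cas(volatile atomic_long *addr, long old, long new_val)
{
    return atomic_compare_exchange_strong(addr, &old, new_val);
}

/* the int variant is a pure wrapper around the word-sized one, which is
 * exactly what the AO_int_... macros below do when AO_t is int */
static bool int_cas(volatile atomic_long *addr, int old, int new_val)
{
    return word_cas(addr, (long)old, (long)new_val);
}
#endif
/*
 * [End of editorial aside; the original header comment closes below.]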
*/ #if defined(AO_HAVE_compare_and_swap_full) && \ !defined(AO_HAVE_int_compare_and_swap_full) # define AO_int_compare_and_swap_full(addr, old, new_val) \ AO_compare_and_swap_full((volatile AO_t *)(addr), \ (AO_t)(old), (AO_t)(new_val)) # define AO_HAVE_int_compare_and_swap_full # endif #if defined(AO_HAVE_compare_and_swap_acquire) && \ !defined(AO_HAVE_int_compare_and_swap_acquire) # define AO_int_compare_and_swap_acquire(addr, old, new_val) \ AO_compare_and_swap_acquire((volatile AO_t *)(addr), \ (AO_t)(old), (AO_t)(new_val)) # define AO_HAVE_int_compare_and_swap_acquire # endif #if defined(AO_HAVE_compare_and_swap_release) && \ !defined(AO_HAVE_int_compare_and_swap_release) # define AO_int_compare_and_swap_release(addr, old, new_val) \ AO_compare_and_swap_release((volatile AO_t *)(addr), \ (AO_t)(old), (AO_t)(new_val)) # define AO_HAVE_int_compare_and_swap_release # endif #if defined(AO_HAVE_compare_and_swap_write) && \ !defined(AO_HAVE_int_compare_and_swap_write) # define AO_int_compare_and_swap_write(addr, old, new_val) \ AO_compare_and_swap_write((volatile AO_t *)(addr), \ (AO_t)(old), (AO_t)(new_val)) # define AO_HAVE_int_compare_and_swap_write # endif #if defined(AO_HAVE_compare_and_swap_read) && \ !defined(AO_HAVE_int_compare_and_swap_read) # define AO_int_compare_and_swap_read(addr, old, new_val) \ AO_compare_and_swap_read((volatile AO_t *)(addr), \ (AO_t)(old), (AO_t)(new_val)) # define AO_HAVE_int_compare_and_swap_read # endif #if defined(AO_HAVE_compare_and_swap) && \ !defined(AO_HAVE_int_compare_and_swap) # define AO_int_compare_and_swap(addr, old, new_val) \ AO_compare_and_swap((volatile AO_t *)(addr), \ (AO_t)(old), (AO_t)(new_val)) # define AO_HAVE_int_compare_and_swap # endif #if defined(AO_HAVE_load_acquire) && \ !defined(AO_HAVE_int_load_acquire) # define AO_int_load_acquire(addr) \ (int)AO_load_acquire((const volatile AO_t *)(addr)) # define AO_HAVE_int_load_acquire # endif #if defined(AO_HAVE_store_release) && \ !defined(AO_HAVE_int_store_release) # define AO_int_store_release(addr, val) \ AO_store_release((volatile AO_t *)(addr), (AO_t)(val)) # define AO_HAVE_int_store_release # endif #if defined(AO_HAVE_fetch_and_add_full) && \ !defined(AO_HAVE_int_fetch_and_add_full) # define AO_int_fetch_and_add_full(addr, incr) \ (int)AO_fetch_and_add_full((volatile AO_t *)(addr), (AO_t)(incr)) # define AO_HAVE_int_fetch_and_add_full # endif #if defined(AO_HAVE_fetch_and_add1_acquire) && \ !defined(AO_HAVE_int_fetch_and_add1_acquire) # define AO_int_fetch_and_add1_acquire(addr) \ (int)AO_fetch_and_add1_acquire((volatile AO_t *)(addr)) # define AO_HAVE_int_fetch_and_add1_acquire # endif #if defined(AO_HAVE_fetch_and_add1_release) && \ !defined(AO_HAVE_int_fetch_and_add1_release) # define AO_int_fetch_and_add1_release(addr) \ (int)AO_fetch_and_add1_release((volatile AO_t *)(addr)) # define AO_HAVE_int_fetch_and_add1_release # endif #if defined(AO_HAVE_fetch_and_sub1_acquire) && \ !defined(AO_HAVE_int_fetch_and_sub1_acquire) # define AO_int_fetch_and_sub1_acquire(addr) \ (int)AO_fetch_and_sub1_acquire((volatile AO_t *)(addr)) # define AO_HAVE_int_fetch_and_sub1_acquire # endif #if defined(AO_HAVE_fetch_and_sub1_release) && \ !defined(AO_HAVE_int_fetch_and_sub1_release) # define AO_int_fetch_and_sub1_release(addr) \ (int)AO_fetch_and_sub1_release((volatile AO_t *)(addr)) # define AO_HAVE_int_fetch_and_sub1_release # endif ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/armcc/arm_v6.h 
================================================ /* * Copyright (c) 2007 by NEC LE-IT: All rights reserved. * A transcription of ARMv6 atomic operations for the ARM Realview Toolchain. * This code works with armcc from RVDS 3.1 * This is based on work in gcc/arm.h by * Copyright (c) 1991-1994 by Xerox Corporation. All rights reserved. * Copyright (c) 1996-1999 by Silicon Graphics. All rights reserved. * Copyright (c) 1999-2003 by Hewlett-Packard Company. All rights reserved. * * * * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED * OR IMPLIED. ANY USE IS AT YOUR OWN RISK. * * Permission is hereby granted to use or copy this program * for any purpose, provided the above notices are retained on all copies. * Permission to modify the code and to distribute modified code is granted, * provided the above notices are retained, and a notice that the code was * modified is included with the above copyright notice. * */ #include "../read_ordered.h" #include "../test_and_set_t_is_ao_t.h" /* Probably suboptimal */ #if __TARGET_ARCH_ARM < 6 Dont use with ARM instruction sets lower than v6 #else #include "../standard_ao_double_t.h" /* NEC LE-IT: ARMv6 is the first architecture providing support for simple LL/SC * A data memory barrier must be raised via CP15 command (see documentation). * * ARMv7 is compatible to ARMv6 but has a simpler command for issuing a * memory barrier (DMB). Raising it via CP15 should still work as told me by the * support engineers. If it turns out to be much quicker than we should implement * custom code for ARMv7 using the asm { dmb } command. * * If only a single processor is used, we can define AO_UNIPROCESSOR * and do not need to access CP15 for ensuring a DMB at all. */ AO_INLINE void AO_nop_full(void) { #ifndef AO_UNIPROCESSOR unsigned int dest=0; /* issue an data memory barrier (keeps ordering of memory transactions */ /* before and after this operation) */ __asm { mcr p15,0,dest,c7,c10,5 } ; #endif } #define AO_HAVE_nop_full AO_INLINE AO_t AO_load(const volatile AO_t *addr) { /* Cast away the volatile in case it adds fence semantics */ return (*(const AO_t *)addr); } #define AO_HAVE_load /* NEC LE-IT: atomic "store" - according to ARM documentation this is * the only safe way to set variables also used in LL/SC environment. * A direct write won't be recognized by the LL/SC construct in other CPUs. * * HB: Based on subsequent discussion, I think it would be OK to use an * ordinary store here if we knew that interrupt handlers always cleared * the reservation. They should, but there is some doubt that this is * currently always the case for e.g. Linux. */ AO_INLINE void AO_store(volatile AO_t *addr, AO_t value) { unsigned long tmp; retry: __asm { ldrex tmp, [addr] strex tmp, value, [addr] teq tmp, #0 bne retry }; } #define AO_HAVE_store /* NEC LE-IT: replace the SWAP as recommended by ARM: "Applies to: ARM11 Cores Though the SWP instruction will still work with ARM V6 cores, it is recommended to use the new V6 synchronization instructions. The SWP instruction produces locked read and write accesses which are atomic, i.e. another operation cannot be done between these locked accesses which ties up external bus (AHB,AXI) bandwidth and can increase worst case interrupt latencies. LDREX,STREX are more flexible, other instructions can be done between the LDREX and STREX accesses. 
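 *
 * [Editorial aside, added by the editor; not part of the original
 * libatomic_ops source. Every AO_ routine in this file is an LL/SC retry
 * loop: ldrex loads the word and sets a reservation, strex stores only if
 * the reservation still holds, and the code branches back on failure.
 * C11's compare_exchange_weak expresses the same loop portably; "weak"
 * means it may fail spuriously, just like strex. A sketch mirroring
 * AO_fetch_and_add (fetch_and_add here is an illustrative name):]
 */
#if 0 /* editorial illustration only; not compiled as part of this header */
#include <stdatomic.h>

static unsigned long fetch_and_add(volatile atomic_ulong *p,
                                   unsigned long incr)
{
    unsigned long old = atomic_load_explicit(p, memory_order_relaxed);
    /* retry until the store-conditional (the weak CAS) succeeds;
     * on failure, old is refreshed with the current value */
    while (!atomic_compare_exchange_weak(p, &old, old + incr)) {
    }
    return old;
}
#endif
/*
 * [End of editorial aside; the quoted ARM application note closes below.]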
" */ AO_INLINE AO_TS_t AO_test_and_set(volatile AO_TS_t *addr) { AO_TS_t oldval; unsigned long tmp; unsigned long one = 1; retry: __asm { ldrex oldval, [addr] strex tmp, one, [addr] teq tmp, #0 bne retry } return oldval; } #define AO_HAVE_test_and_set /* NEC LE-IT: fetch and add for ARMv6 */ AO_INLINE AO_t AO_fetch_and_add(volatile AO_t *p, AO_t incr) { unsigned long tmp,tmp2; AO_t result; retry: __asm { ldrex result, [p] add tmp, incr, result strex tmp2, tmp, [p] teq tmp2, #0 bne retry } return result; } #define AO_HAVE_fetch_and_add /* NEC LE-IT: fetch and add1 for ARMv6 */ AO_INLINE AO_t AO_fetch_and_add1(volatile AO_t *p) { unsigned long tmp,tmp2; AO_t result; retry: __asm { ldrex result, [p] add tmp, result, #1 strex tmp2, tmp, [p] teq tmp2, #0 bne retry } return result; } #define AO_HAVE_fetch_and_add1 /* NEC LE-IT: fetch and sub for ARMv6 */ AO_INLINE AO_t AO_fetch_and_sub1(volatile AO_t *p) { unsigned long tmp,tmp2; AO_t result; retry: __asm { ldrex result, [p] sub tmp, result, #1 strex tmp2, tmp, [p] teq tmp2, #0 bne retry } return result; } #define AO_HAVE_fetch_and_sub1 /* NEC LE-IT: compare and swap */ /* Returns nonzero if the comparison succeeded. */ AO_INLINE int AO_compare_and_swap(volatile AO_t *addr, AO_t old_val, AO_t new_val) { AO_t result,tmp; retry: __asm__ { mov result, #2 ldrex tmp, [addr] teq tmp, old_val it eq strexeq result, new_val, [addr] teq result, #1 beq retry } return !(result&2); } #define AO_HAVE_compare_and_swap /* helper functions for the Realview compiler: LDREXD is not usable * with inline assembler, so use the "embedded" assembler as * suggested by ARM Dev. support (June 2008). */ __asm inline double_ptr_storage load_ex(volatile AO_double_t *addr) { LDREXD r0,r1,[r0] } __asm inline int store_ex(AO_t val1, AO_t val2, volatile AO_double_t *addr) { STREXD r3,r0,r1,[r2] MOV r0,r3 } AO_INLINE int AO_compare_double_and_swap_double(volatile AO_double_t *addr, AO_t old_val1, AO_t old_val2, AO_t new_val1, AO_t new_val2) { double_ptr_storage old_val = ((double_ptr_storage)old_val2 << 32) | old_val1; double_ptr_storage tmp; int result; while(1) { tmp = load_ex(addr); if(tmp != old_val) return 0; result = store_ex(new_val1, new_val2, addr); if(!result) return 1; } } #define AO_HAVE_compare_double_and_swap_double #endif // __TARGET_ARCH_ARM ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/atomic_load_store.h ================================================ /* * Copyright (c) 2003 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * Definitions for architectures on which loads and stores of AO_t are * atomic for all legal alignments. */ AO_INLINE AO_t AO_load(const volatile AO_t *addr) { /* Cast away the volatile for architectures like IA64 where */ /* volatile adds barrier semantics. */ return (*(const AO_t *)addr); } #define AO_HAVE_load AO_INLINE void AO_store(volatile AO_t *addr, AO_t new_val) { (*(AO_t *)addr) = new_val; } #define AO_HAVE_store ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/char_acquire_release_volatile.h ================================================ /* * Copyright (c) 2003-2004 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * This file adds definitions appropriate for environments in which an unsigned char * volatile load has acquire semantics, and an unsigned char volatile store has release * semantics. This is true with the standard Itanium ABI. */ #if !defined(AO_GCC_BARRIER) # if defined(__GNUC__) # define AO_GCC_BARRIER() AO_compiler_barrier() # else # define AO_GCC_BARRIER() # endif #endif AO_INLINE unsigned char AO_char_load_acquire(const volatile unsigned char *p) { unsigned char result = *p; /* A normal volatile load generates an ld.acq */ AO_GCC_BARRIER(); return result; } #define AO_HAVE_char_load_acquire AO_INLINE void AO_char_store_release(volatile unsigned char *p, unsigned char val) { AO_GCC_BARRIER(); /* A normal volatile store generates an st.rel */ *p = val; } #define AO_HAVE_char_store_release ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/char_atomic_load_store.h ================================================ /* * Copyright (c) 2003 by Hewlett-Packard Company. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * Definitions for architectures on which loads and stores of unsigned char are * atomic for all legal alignments. */ AO_INLINE unsigned char AO_char_load(const volatile unsigned char *addr) { /* Cast away the volatile for architectures like IA64 where */ /* volatile adds barrier semantics. */ return (*(const unsigned char *)addr); } #define AO_HAVE_char_load AO_INLINE void AO_char_store(volatile unsigned char *addr, unsigned char new_val) { (*(unsigned char *)addr) = new_val; } #define AO_HAVE_char_store ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/emul_cas.h ================================================ /* * Copyright (c) 2003 by Hewlett-Packard Company. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * Ensure, if at all possible, that AO_compare_and_swap_full() is * available. The emulation should be brute-force signal-safe, even * though it actually blocks. * Including this file will generate an error if AO_compare_and_swap_full() * cannot be made available. * This will be included from platform-specific atomic_ops files * if appropriate, and if AO_FORCE_CAS is defined. It should not be * included directly, especially since it affects the implementation * of other atomic update primitives.
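 *
 * [Editorial aside, added by the editor; not part of the original
 * libatomic_ops source. The emulation strategy is "CAS under a lock": take
 * a lock, perform an ordinary read-compare-store, release the lock. The
 * minimal C11 sketch below shows the core idea; a real emulation, such as
 * the one behind this header, also has to worry about signal safety and
 * typically spreads contention over several locks. cas_lock and
 * emulated_cas are illustrative names only:]
 */
#if 0 /* editorial illustration only; not compiled as part of this header */
#include <stdatomic.h>

static atomic_flag cas_lock = ATOMIC_FLAG_INIT;

/* atomic only with respect to other operations that take cas_lock */
static int emulated_cas(volatile unsigned long *addr,
                        unsigned long old, unsigned long new_val)
{
    int ok;
    while (atomic_flag_test_and_set_explicit(&cas_lock, memory_order_acquire)) {
        /* spin until the lock is free */
    }
    ok = (*addr == old);
    if (ok)
        *addr = new_val;
    atomic_flag_clear_explicit(&cas_lock, memory_order_release);
    return ok;
}
#endif
/*
 * [End of editorial aside; the original header comment resumes below.]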
* The implementation assumes that only AO_store_XXX and AO_test_and_set_XXX * variants are defined, and that AO_test_and_set_XXX is not used to * operate on compare_and_swap locations. */ #if !defined(ATOMIC_OPS_H) # error This file should not be included directly. #endif #ifndef AO_HAVE_double_t # include "standard_ao_double_t.h" #endif int AO_compare_and_swap_emulation(volatile AO_t *addr, AO_t old, AO_t new_val); int AO_compare_double_and_swap_double_emulation(volatile AO_double_t *addr, AO_t old_val1, AO_t old_val2, AO_t new_val1, AO_t new_val2); void AO_store_full_emulation(volatile AO_t *addr, AO_t val); #define AO_compare_and_swap_full(addr, old, newval) \ AO_compare_and_swap_emulation(addr, old, newval) #define AO_HAVE_compare_and_swap_full #ifndef AO_HAVE_compare_double_and_swap_double # define AO_compare_double_and_swap_double_full(addr, old1, old2, \ newval1, newval2) \ AO_compare_double_and_swap_double_emulation(addr, old1, old2, \ newval1, newval2) # define AO_HAVE_compare_double_and_swap_double_full #endif #undef AO_store #undef AO_HAVE_store #undef AO_store_write #undef AO_HAVE_store_write #undef AO_store_release #undef AO_HAVE_store_release #undef AO_store_full #undef AO_HAVE_store_full #define AO_store_full(addr, val) AO_store_full_emulation(addr, val) #define AO_HAVE_store_full ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/gcc/alpha.h ================================================ /* * Copyright (c) 1991-1994 by Xerox Corporation. All rights reserved. * Copyright (c) 1996-1999 by Silicon Graphics. All rights reserved. * Copyright (c) 1999-2003 by Hewlett-Packard Company. All rights reserved. * * * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED * OR IMPLIED. ANY USE IS AT YOUR OWN RISK. * * Permission is hereby granted to use or copy this program * for any purpose, provided the above notices are retained on all copies. * Permission to modify the code and to distribute modified code is granted, * provided the above notices are retained, and a notice that the code was * modified is included with the above copyright notice. * */ #include "../atomic_load_store.h" #include "../test_and_set_t_is_ao_t.h" #define AO_NO_DD_ORDERING /* Data dependence does not imply read ordering. */ AO_INLINE void AO_nop_full(void) { __asm__ __volatile__("mb" : : : "memory"); } #define AO_HAVE_nop_full AO_INLINE void AO_nop_write(void) { __asm__ __volatile__("wmb" : : : "memory"); } #define AO_HAVE_nop_write /* mb should be used for AO_nop_read(). That's the default. */ /* We believe that ldq_l ... stq_c does not imply any memory barrier. */ /* We should add an explicit fetch_and_add definition. */ AO_INLINE int AO_compare_and_swap(volatile AO_t *addr, AO_t old, AO_t new_val) { unsigned long was_equal; unsigned long temp; __asm__ __volatile__( "1: ldq_l %0,%1\n" " cmpeq %0,%4,%2\n" " mov %3,%0\n" " beq %2,2f\n" " stq_c %0,%1\n" " beq %0,1b\n" "2:\n" :"=&r" (temp), "=m" (*addr), "=&r" (was_equal) : "r" (new_val), "Ir" (old) :"memory"); return was_equal; } #define AO_HAVE_compare_and_swap ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/gcc/arm.h ================================================ /* * Copyright (c) 1991-1994 by Xerox Corporation. All rights reserved. * Copyright (c) 1996-1999 by Silicon Graphics. All rights reserved. * Copyright (c) 1999-2003 by Hewlett-Packard Company. All rights reserved. 
* * * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED * OR IMPLIED. ANY USE IS AT YOUR OWN RISK. * * Permission is hereby granted to use or copy this program * for any purpose, provided the above notices are retained on all copies. * Permission to modify the code and to distribute modified code is granted, * provided the above notices are retained, and a notice that the code was * modified is included with the above copyright notice. * */ #include "../read_ordered.h" #include "../test_and_set_t_is_ao_t.h" /* Probably suboptimal */ /* NEC LE-IT: ARMv6 is the first architecture providing support for simple LL/SC * A data memory barrier must be raised via CP15 command (see documentation). * * ARMv7 is compatible to ARMv6 but has a simpler command for issuing a * memory barrier (DMB). Raising it via CP15 should still work as told me by the * support engineers. If it turns out to be much quicker than we should implement * custom code for ARMv7 using the asm { dmb } command. * * If only a single processor is used, we can define AO_UNIPROCESSOR * and do not need to access CP15 for ensuring a DMB */ /* NEC LE-IT: gcc has no way to easily check the arm architecture * but defines only one of __ARM_ARCH_x__ to be true */ #if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6ZK__) \ || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \ || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7R__) #include "../standard_ao_double_t.h" AO_INLINE void AO_nop_full(void) { #ifndef AO_UNIPROCESSOR /* issue an data memory barrier (keeps ordering of memory transactions */ /* before and after this operation) */ unsigned int arg=0; __asm__ __volatile__("mcr p15,0,%0,c7,c10,5" : : "r" (arg) : "memory"); #endif } #define AO_HAVE_nop_full /* NEC LE-IT: AO_t load is simple reading */ AO_INLINE AO_t AO_load(const volatile AO_t *addr) { /* Cast away the volatile for architectures like IA64 where */ /* volatile adds barrier semantics. */ return (*(const AO_t *)addr); } #define AO_HAVE_load /* NEC LE-IT: atomic "store" - according to ARM documentation this is * the only safe way to set variables also used in LL/SC environment. * A direct write won't be recognized by the LL/SC construct on the _same_ CPU. * Support engineers response for behaviour of ARMv6: * Core1 Core2 SUCCESS =================================== LDREX(x) STREX(x) Yes ----------------------------------- LDREX(x) STR(x) STREX(x) No ----------------------------------- LDREX(x) STR(x) STREX(x) Yes ----------------------------------- * ARMv7 behaves similar, see documentation CortexA8 TRM, point 8.5 * * HB: I think this is only a problem if interrupt handlers do not clear * the reservation, as they almost certainly should. Probably change this back * in a while? */ AO_INLINE void AO_store(volatile AO_t *addr, AO_t value) { AO_t flag; __asm__ __volatile__("@AO_store\n" "1: ldrex %0, [%2]\n" " strex %0, %3, [%2]\n" " teq %0, #0\n" " bne 1b" : "=&r"(flag), "+m"(*addr) : "r" (addr), "r"(value) : "cc"); } #define AO_HAVE_store /* NEC LE-IT: replace the SWAP as recommended by ARM: "Applies to: ARM11 Cores Though the SWP instruction will still work with ARM V6 cores, it is recommended to use the new V6 synchronization instructions. The SWP instruction produces 'locked' read and write accesses which are atomic, i.e. another operation cannot be done between these locked accesses which ties up external bus (AHB,AXI) bandwidth and can increase worst case interrupt latencies. 
LDREX,STREX are more flexible, other instructions can be done between the LDREX and STREX accesses. " */ AO_INLINE AO_TS_t AO_test_and_set(volatile AO_TS_t *addr) { AO_TS_t oldval; unsigned long flag; __asm__ __volatile__("@AO_test_and_set\n" "1: ldrex %0, [%3]\n" " strex %1, %4, [%3]\n" " teq %1, #0\n" " bne 1b\n" : "=&r"(oldval),"=&r"(flag), "+m"(*addr) : "r"(addr), "r"(1) : "cc"); return oldval; } #define AO_HAVE_test_and_set /* NEC LE-IT: fetch and add for ARMv6 */ AO_INLINE AO_t AO_fetch_and_add(volatile AO_t *p, AO_t incr) { unsigned long flag,tmp; AO_t result; __asm__ __volatile__("@AO_fetch_and_add\n" "1: ldrex %0, [%5]\n" /* get original */ " add %2, %0, %4\n" /* sum up in incr */ " strex %1, %2, [%5]\n" /* store them */ " teq %1, #0\n" " bne 1b\n" : "=&r"(result),"=&r"(flag),"=&r"(tmp),"+m"(*p) /* 0..3 */ : "r"(incr), "r"(p) /* 4..5 */ : "cc"); return result; } #define AO_HAVE_fetch_and_add /* NEC LE-IT: fetch and add1 for ARMv6 */ AO_INLINE AO_t AO_fetch_and_add1(volatile AO_t *p) { unsigned long flag,tmp; AO_t result; __asm__ __volatile__("@AO_fetch_and_add1\n" "1: ldrex %0, [%4]\n" /* get original */ " add %1, %0, #1\n" /* increment */ " strex %2, %1, [%4]\n" /* store them */ " teq %2, #0\n" " bne 1b\n" : "=&r"(result), "=&r"(tmp), "=&r"(flag), "+m"(*p) : "r"(p) : "cc"); return result; } #define AO_HAVE_fetch_and_add1 /* NEC LE-IT: fetch and sub for ARMv6 */ AO_INLINE AO_t AO_fetch_and_sub1(volatile AO_t *p) { unsigned long flag,tmp; AO_t result; __asm__ __volatile__("@AO_fetch_and_sub1\n" "1: ldrex %0, [%4]\n" /* get original */ " sub %1, %0, #1\n" /* decrement */ " strex %2, %1, [%4]\n" /* store them */ " teq %2, #0\n" " bne 1b\n" : "=&r"(result), "=&r"(tmp), "=&r"(flag), "+m"(*p) : "r"(p) : "cc"); return result; } #define AO_HAVE_fetch_and_sub1 /* NEC LE-IT: compare and swap */ /* Returns nonzero if the comparison succeeded. */ AO_INLINE int AO_compare_and_swap(volatile AO_t *addr, AO_t old_val, AO_t new_val) { AO_t result,tmp; __asm__ __volatile__("@ AO_compare_and_swap\n" "1: mov %0, #2\n" /* store a flag */ " ldrex %1, [%3]\n" /* get original */ " teq %1, %4\n" /* see if match */ " it eq\n" " strexeq %0, %5, [%3]\n" /* store new one if matched */ " teq %0, #1\n" " beq 1b\n" /* if update failed, repeat */ : "=&r"(result), "=&r"(tmp), "+m"(*addr) : "r"(addr), "r"(old_val), "r"(new_val) : "cc"); return !(result&2); /* if succeded, return 1, else 0 */ } #define AO_HAVE_compare_and_swap AO_INLINE int AO_compare_double_and_swap_double(volatile AO_double_t *addr, AO_t old_val1, AO_t old_val2, AO_t new_val1, AO_t new_val2) { double_ptr_storage old_val = ((double_ptr_storage)old_val2 << 32) | old_val1; double_ptr_storage new_val = ((double_ptr_storage)new_val2 << 32) | new_val1; double_ptr_storage tmp; int result; while(1) { __asm__ __volatile__("@ AO_compare_and_swap_double\n" " ldrexd %0, [%1]\n" /* get original to r1 & r2 */ : "=&r"(tmp) : "r"(addr) : "cc"); if(tmp != old_val) return 0; __asm__ __volatile__( " strexd %0, %2, [%3]\n" /* store new one if matched */ : "=&r"(result),"+m"(*addr) : "r"(new_val), "r"(addr) : "cc"); if(!result) return 1; } } #define AO_HAVE_compare_double_and_swap_double #else /* pre ARMv6 architectures ... */ /* I found a slide set that, if I read it correctly, claims that */ /* Loads followed by either a Load or Store are ordered, but nothing */ /* else is. */ /* It appears that SWP is the only simple memory barrier. 
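 *
 * [Editorial aside, added by the editor; not part of the original
 * libatomic_ops source. A test-and-set primitive such as the SWP-based
 * AO_test_and_set_full below is exactly enough to build a spinlock, which
 * is its usual role. A self-contained C11 equivalent, with atomic_flag
 * playing the part of AO_TS_t; spin_lock and spin_unlock are illustrative
 * names:]
 */
#if 0 /* editorial illustration only; not compiled as part of this header */
#include <stdatomic.h>

static atomic_flag lock_word = ATOMIC_FLAG_INIT;

static void spin_lock(void)
{
    /* test-and-set returns the previous value; loop until it was clear */
    while (atomic_flag_test_and_set_explicit(&lock_word, memory_order_acquire)) {
    }
}

static void spin_unlock(void)
{
    atomic_flag_clear_explicit(&lock_word, memory_order_release);
}
#endif
/*
 * [End of editorial aside; the original comment closes below.]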
*/ #include "../all_atomic_load_store.h" AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t *addr) { AO_TS_VAL_t oldval; /* SWP on ARM is very similar to XCHG on x86. */ /* The first operand is the result, the second the value */ /* to be stored. Both registers must be different from addr. */ /* Make the address operand an early clobber output so it */ /* doesn't overlap with the other operands. The early clobber*/ /* on oldval is necessary to prevent the compiler allocating */ /* them to the same register if they are both unused. */ __asm__ __volatile__("swp %0, %2, [%3]" : "=&r"(oldval), "=&r"(addr) : "r"(1), "1"(addr) : "memory"); return oldval; } #define AO_HAVE_test_and_set_full #endif /* __ARM_ARCH_x */ ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/gcc/avr32.h ================================================ /* * Copyright (C) 2009 Bradley Smith * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * */ #include "../all_atomic_load_store.h" #include "../ordered.h" /* There are no multiprocessor implementations. */ #include "../test_and_set_t_is_ao_t.h" AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t *addr) { register long ret; __asm__ __volatile__( "xchg %[oldval], %[mem], %[newval]" : [oldval] "=&r"(ret) : [mem] "r"(addr), [newval] "r"(1) : "memory"); return (AO_TS_VAL_t)ret; } #define AO_HAVE_test_and_set_full AO_INLINE int AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) { register long ret; __asm__ __volatile__( "1: ssrf 5\n" " ld.w %[res], %[mem]\n" " eor %[res], %[oldval]\n" " brne 2f\n" " stcond %[mem], %[newval]\n" " brne 1b\n" "2:\n" : [res] "=&r"(ret), [mem] "=m"(*addr) : "m"(*addr), [newval] "r"(new_val), [oldval] "r"(old) : "cc", "memory"); return (int)ret; } #define AO_HAVE_compare_and_swap_full ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/gcc/cris.h ================================================ /* * Copyright (c) 2004 Hewlett-Packard Development Company, L.P. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * Most of this code originally comes from Hans-Peter Nilsson. It is included * here with his permission. * * This version has not been tested. It was copied here from a GC * patch so that we wouldn't lose the code in the upgrade to gc7. */ #include "../all_atomic_load_store.h" #include "../ordered.h" /* There are no multiprocessor implementations. */ #include "../test_and_set_t_is_ao_t.h" /* * The architecture apparently supports an "f" flag which is * set on preemption. This essentially gives us load-locked, * store-conditional primitives, though I'm not quite sure how * this would work on a hypothetical multiprocessor. -HB * * For details, see * http://developer.axis.com/doc/hardware/etrax100lx/prog_man/ * 1_architectural_description.pdf * * Presumably many other primitives (notably CAS, including the double- * width versions) could be implemented in this manner, if someone got * around to it. */ AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t *addr) { /* Ripped from linuxthreads/sysdeps/cris/pt-machine.h */ register unsigned long int ret; /* Note the use of a dummy output of *addr to expose the write. The memory barrier is to stop *other* writes being moved past this code. */ __asm__ __volatile__("clearf\n" "0:\n\t" "movu.b [%2],%0\n\t" "ax\n\t" "move.b %3,[%2]\n\t" "bwf 0b\n\t" "clearf" : "=&r" (ret), "=m" (*addr) : "r" (addr), "r" ((int) 1), "m" (*addr) : "memory"); return ret; } #define AO_HAVE_test_and_set_full ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/gcc/hppa.h ================================================ /* * Copyright (c) 2003 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * Modified by Carlos O'Donell , 2003 * - Added self-aligning lock. * */ #include "../all_atomic_load_store.h" /* Some architecture set descriptions include special "ordered" memory */ /* operations. As far as we can tell, no existing processors actually */ /* require those. Nor does it appear likely that future processors */ /* will. */ #include "../ordered.h" /* GCC will not guarantee the alignment we need, use four lock words */ /* and select the correctly aligned datum. See the glibc 2.3.2 */ /* linuxthread port for the original implementation. */ struct AO_pa_clearable_loc { int data[4]; }; #undef AO_TS_INITIALIZER #define AO_TS_t struct AO_pa_clearable_loc #define AO_TS_INITIALIZER {1,1,1,1} /* Switch meaning of set and clear, since we only have an atomic clear */ /* instruction. */ typedef enum {AO_PA_TS_set = 0, AO_PA_TS_clear = 1} AO_PA_TS_val; #define AO_TS_VAL_t AO_PA_TS_val #define AO_TS_CLEAR AO_PA_TS_clear #define AO_TS_SET AO_PA_TS_set /* The hppa only has one atomic read and modify memory operation, */ /* load and clear, so hppa spinlocks must use zero to signify that */ /* someone is holding the lock. The address used for the ldcw */ /* semaphore must be 16-byte aligned. */ #define __ldcw(a) ({ \ volatile unsigned int __ret; \ __asm__ __volatile__("ldcw 0(%2),%0" \ : "=r" (__ret), "=m" (*(a)) : "r" (a)); \ __ret; \ }) /* Because malloc only guarantees 8-byte alignment for malloc'd data, */ /* and GCC only guarantees 8-byte alignment for stack locals, we can't */ /* be assured of 16-byte alignment for atomic lock data even if we */ /* specify "__attribute ((aligned(16)))" in the type declaration. So, */ /* we use a struct containing an array of four ints for the atomic lock */ /* type and dynamically select the 16-byte aligned int from the array */ /* for the semaphore. */ #define __PA_LDCW_ALIGNMENT 16 #define __ldcw_align(a) ({ \ unsigned long __ret = (unsigned long) a; \ __ret += __PA_LDCW_ALIGNMENT - 1; \ __ret &= ~(__PA_LDCW_ALIGNMENT - 1); \ (volatile unsigned int *) __ret; \ }) /* Works on PA 1.1 and PA 2.0 systems */ AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t * addr) { volatile unsigned int *a = __ldcw_align (addr); return (AO_TS_VAL_t) __ldcw (a); } AO_INLINE void AO_pa_clear(volatile AO_TS_t * addr) { volatile unsigned int *a = __ldcw_align (addr); AO_compiler_barrier(); *a = 1; } #define AO_CLEAR(addr) AO_pa_clear(addr) #define AO_HAVE_test_and_set_full ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/gcc/ia64.h ================================================ /* * Copyright (c) 2003 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "../all_atomic_load_store.h" #include "../all_acquire_release_volatile.h" #include "../test_and_set_t_is_char.h" #ifdef _ILP32 /* 32-bit HP/UX code. */ /* This requires pointer "swizzling". Pointers need to be expanded */ /* to 64 bits using the addp4 instruction before use. This makes it */ /* hard to share code, but we try anyway. */ # define AO_LEN "4" /* We assume that addr always appears in argument position 1 in asm */ /* code. If it is clobbered due to swizzling, we also need it in */ /* second position. Any later arguments are referenced symbolically, */ /* so that we don't have to worry about their position. This requires*/ /* gcc 3.1, but you shouldn't be using anything older than that on */ /* IA64 anyway. */ /* The AO_MASK macro is a workaround for the fact that HP/UX gcc */ /* appears to otherwise store 64-bit pointers in ar.ccv, i.e. it */ /* doesn't appear to clear high bits in a pointer value we pass into */ /* assembly code, even if it is supposedly of type AO_t. */ # define AO_IN_ADDR "1"(addr) # define AO_OUT_ADDR , "=r"(addr) # define AO_SWIZZLE "addp4 %1=0,%1;;\n" # define AO_MASK(ptr) __asm__("zxt4 %1=%1": "=r"(ptr) : "0"(ptr)); #else # define AO_LEN "8" # define AO_IN_ADDR "r"(addr) # define AO_OUT_ADDR # define AO_SWIZZLE # define AO_MASK(ptr) #endif AO_INLINE void AO_nop_full(void) { __asm__ __volatile__("mf" : : : "memory"); } #define AO_HAVE_nop_full AO_INLINE AO_t AO_fetch_and_add1_acquire (volatile AO_t *addr) { AO_t result; __asm__ __volatile__ (AO_SWIZZLE "fetchadd" AO_LEN ".acq %0=[%1],1": "=r" (result) AO_OUT_ADDR: AO_IN_ADDR :"memory"); return result; } #define AO_HAVE_fetch_and_add1_acquire AO_INLINE AO_t AO_fetch_and_add1_release (volatile AO_t *addr) { AO_t result; __asm__ __volatile__ (AO_SWIZZLE "fetchadd" AO_LEN ".rel %0=[%1],1": "=r" (result) AO_OUT_ADDR: AO_IN_ADDR :"memory"); return result; } #define AO_HAVE_fetch_and_add1_release AO_INLINE AO_t AO_fetch_and_sub1_acquire (volatile AO_t *addr) { AO_t result; __asm__ __volatile__ (AO_SWIZZLE "fetchadd" AO_LEN ".acq %0=[%1],-1": "=r" (result) AO_OUT_ADDR: AO_IN_ADDR :"memory"); return result; } #define AO_HAVE_fetch_and_sub1_acquire AO_INLINE AO_t AO_fetch_and_sub1_release (volatile AO_t *addr) { AO_t result; __asm__ __volatile__ (AO_SWIZZLE "fetchadd" AO_LEN ".rel %0=[%1],-1": "=r" (result) AO_OUT_ADDR: AO_IN_ADDR :"memory"); return result; } #define AO_HAVE_fetch_and_sub1_release #ifndef _ILP32 AO_INLINE unsigned int AO_int_fetch_and_add1_acquire (volatile unsigned int *addr) { unsigned int result; __asm__ __volatile__ ("fetchadd4.acq %0=[%1],1": "=r" (result): AO_IN_ADDR :"memory"); return result; } #define AO_HAVE_int_fetch_and_add1_acquire AO_INLINE unsigned int AO_int_fetch_and_add1_release (volatile unsigned int *addr) { unsigned int result; __asm__ __volatile__ ("fetchadd4.rel %0=[%1],1": "=r" (result): AO_IN_ADDR :"memory"); return result; } #define AO_HAVE_int_fetch_and_add1_release AO_INLINE unsigned int AO_int_fetch_and_sub1_acquire (volatile unsigned int *addr) { unsigned int result; __asm__ __volatile__ 
("fetchadd4.acq %0=[%1],-1": "=r" (result): AO_IN_ADDR :"memory"); return result; } #define AO_HAVE_int_fetch_and_sub1_acquire AO_INLINE unsigned int AO_int_fetch_and_sub1_release (volatile unsigned int *addr) { unsigned int result; __asm__ __volatile__ ("fetchadd4.rel %0=[%1],-1": "=r" (result): AO_IN_ADDR :"memory"); return result; } #define AO_HAVE_int_fetch_and_sub1_release #endif /* !_ILP32 */ AO_INLINE int AO_compare_and_swap_acquire(volatile AO_t *addr, AO_t old, AO_t new_val) { AO_t oldval; AO_MASK(old); __asm__ __volatile__(AO_SWIZZLE "mov ar.ccv=%[old] ;; cmpxchg" AO_LEN ".acq %0=[%1],%[new_val],ar.ccv" : "=r"(oldval) AO_OUT_ADDR : AO_IN_ADDR, [new_val]"r"(new_val), [old]"r"(old) : "memory"); return (oldval == old); } #define AO_HAVE_compare_and_swap_acquire AO_INLINE int AO_compare_and_swap_release(volatile AO_t *addr, AO_t old, AO_t new_val) { AO_t oldval; AO_MASK(old); __asm__ __volatile__(AO_SWIZZLE "mov ar.ccv=%[old] ;; cmpxchg" AO_LEN ".rel %0=[%1],%[new_val],ar.ccv" : "=r"(oldval) AO_OUT_ADDR : AO_IN_ADDR, [new_val]"r"(new_val), [old]"r"(old) : "memory"); return (oldval == old); } #define AO_HAVE_compare_and_swap_release AO_INLINE int AO_char_compare_and_swap_acquire(volatile unsigned char *addr, unsigned char old, unsigned char new_val) { unsigned char oldval; __asm__ __volatile__(AO_SWIZZLE "mov ar.ccv=%[old] ;; cmpxchg1.acq %0=[%1],%[new_val],ar.ccv" : "=r"(oldval) AO_OUT_ADDR : AO_IN_ADDR, [new_val]"r"(new_val), [old]"r"((AO_t)old) : "memory"); return (oldval == old); } #define AO_HAVE_char_compare_and_swap_acquire AO_INLINE int AO_char_compare_and_swap_release(volatile unsigned char *addr, unsigned char old, unsigned char new_val) { unsigned char oldval; __asm__ __volatile__(AO_SWIZZLE "mov ar.ccv=%[old] ;; cmpxchg1.rel %0=[%1],%[new_val],ar.ccv" : "=r"(oldval) AO_OUT_ADDR : AO_IN_ADDR, [new_val]"r"(new_val), [old]"r"((AO_t)old) : "memory"); return (oldval == old); } #define AO_HAVE_char_compare_and_swap_release AO_INLINE int AO_short_compare_and_swap_acquire(volatile unsigned short *addr, unsigned short old, unsigned short new_val) { unsigned short oldval; __asm__ __volatile__(AO_SWIZZLE "mov ar.ccv=%[old] ;; cmpxchg2.acq %0=[%1],%[new_val],ar.ccv" : "=r"(oldval) AO_OUT_ADDR : AO_IN_ADDR, [new_val]"r"(new_val), [old]"r"((AO_t)old) : "memory"); return (oldval == old); } #define AO_HAVE_short_compare_and_swap_acquire AO_INLINE int AO_short_compare_and_swap_release(volatile unsigned short *addr, unsigned short old, unsigned short new_val) { unsigned short oldval; __asm__ __volatile__(AO_SWIZZLE "mov ar.ccv=%[old] ;; cmpxchg2.rel %0=[%1],%[new_val],ar.ccv" : "=r"(oldval) AO_OUT_ADDR : AO_IN_ADDR, [new_val]"r"(new_val), [old]"r"((AO_t)old) : "memory"); return (oldval == old); } #define AO_HAVE_short_compare_and_swap_release #ifndef _ILP32 AO_INLINE int AO_int_compare_and_swap_acquire(volatile unsigned int *addr, unsigned int old, unsigned int new_val) { unsigned int oldval; __asm__ __volatile__("mov ar.ccv=%3 ;; cmpxchg4.acq %0=[%1],%2,ar.ccv" : "=r"(oldval) : AO_IN_ADDR, "r"(new_val), "r"((AO_t)old) : "memory"); return (oldval == old); } #define AO_HAVE_int_compare_and_swap_acquire AO_INLINE int AO_int_compare_and_swap_release(volatile unsigned int *addr, unsigned int old, unsigned int new_val) { unsigned int oldval; __asm__ __volatile__("mov ar.ccv=%3 ;; cmpxchg4.rel %0=[%1],%2,ar.ccv" : "=r"(oldval) : AO_IN_ADDR, "r"(new_val), "r"((AO_t)old) : "memory"); return (oldval == old); } #define AO_HAVE_int_compare_and_swap_release #endif /* !_ILP32 */ /* FIXME: Add 
compare_and_swap_double as soon as there is widely */ /* available hardware that implements it. */ /* FIXME: Add compare_double_and_swap_double for the _ILP32 case. */ #ifdef _ILP32 # include "../ao_t_is_int.h" #endif ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/gcc/m68k.h ================================================ /* * Copyright (c) 1991-1994 by Xerox Corporation. All rights reserved. * Copyright (c) 1996-1999 by Silicon Graphics. All rights reserved. * Copyright (c) 1999-2003 by Hewlett-Packard Company. All rights reserved. * * * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED * OR IMPLIED. ANY USE IS AT YOUR OWN RISK. * * Permission is hereby granted to use or copy this program * for any purpose, provided the above notices are retained on all copies. * Permission to modify the code and to distribute modified code is granted, * provided the above notices are retained, and a notice that the code was * modified is included with the above copyright notice. * */ /* The cas instruction causes an emulation trap for the */ /* 060 with a misaligned pointer, so let's avoid this. */ #undef AO_t typedef unsigned long AO_t __attribute__ ((aligned (4))); /* FIXME. Very incomplete. */ #include "../all_aligned_atomic_load_store.h" /* Are there any m68k multiprocessors still around? */ /* AFAIK, Alliants were sequentially consistent. */ #include "../ordered.h" #include "../test_and_set_t_is_char.h" /* Contributed by Tony Mantler or new. Should be changed to MIT license? */ AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t *addr) { AO_TS_t oldval; /* The value at addr is semi-phony. */ /* 'tas' sets bit 7 while the return */ /* value pretends all bits were set, */ /* which at least matches AO_TS_SET. */ __asm__ __volatile__( "tas %1; sne %0" : "=d" (oldval), "=m" (*addr) : "m" (*addr) : "memory"); return oldval; } #define AO_HAVE_test_and_set_full /* Returns nonzero if the comparison succeeded. */ AO_INLINE int AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) { char result; __asm__ __volatile__( "cas.l %3,%4,%1; seq %0" : "=d" (result), "=m" (*addr) : "m" (*addr), "d" (old), "d" (new_val) : "memory"); return -result; } #define AO_HAVE_compare_and_swap_full #include "../ao_t_is_int.h" ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/gcc/mips.h ================================================ /* * Copyright (c) 2005,2007 Thiemo Seufer * * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED * OR IMPLIED. ANY USE IS AT YOUR OWN RISK. * * Permission is hereby granted to use or copy this program * for any purpose, provided the above notices are retained on all copies. * Permission to modify the code and to distribute modified code is granted, * provided the above notices are retained, and a notice that the code was * modified is included with the above copyright notice. */ /* * FIXME: This should probably make finer distinctions. SGI MIPS is * much more strongly ordered, and in fact closer to sequentially * consistent. This is really aimed at modern embedded implementations. * It looks to me like this assumes a 32-bit ABI. -HB */ #include "../all_aligned_atomic_load_store.h" #include "../acquire_release_volatile.h" #include "../test_and_set_t_is_ao_t.h" #include "../standard_ao_double_t.h" /* Data dependence does not imply read ordering. 
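/* Illustration (not part of this file): the m68k port above exposes only
   test-and-set and CAS. A minimal sketch of how a client might build a
   spinlock on top of the AO_test_and_set_full/AO_CLEAR API, assuming the
   generalized <atomic_ops.h> front end is on the include path: */
#include <atomic_ops.h>

static AO_TS_t lock_word = AO_TS_INITIALIZER;

static void spin_lock(void)
{
  /* Keep retrying until the previous value was clear, i.e. we own the lock. */
  while (AO_test_and_set_full(&lock_word) == AO_TS_SET)
    ; /* busy wait */
}

static void spin_unlock(void)
{
  AO_CLEAR(&lock_word); /* return the location to the clear state */
}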
*/ #define AO_NO_DD_ORDERING AO_INLINE void AO_nop_full(void) { __asm__ __volatile__( " .set push \n" " .set mips2 \n" " .set noreorder \n" " .set nomacro \n" " sync \n" " .set pop " : : : "memory"); } #define AO_HAVE_nop_full AO_INLINE int AO_compare_and_swap(volatile AO_t *addr, AO_t old, AO_t new_val) { register int was_equal = 0; register int temp; __asm__ __volatile__( " .set push \n" " .set mips2 \n" " .set noreorder \n" " .set nomacro \n" "1: ll %0, %1 \n" " bne %0, %4, 2f \n" " move %0, %3 \n" " sc %0, %1 \n" " .set pop \n" " beqz %0, 1b \n" " li %2, 1 \n" "2: " : "=&r" (temp), "+R" (*addr), "+r" (was_equal) : "r" (new_val), "r" (old) : "memory"); return was_equal; } #define AO_HAVE_compare_and_swap /* FIXME: I think the implementations below should be automatically */ /* generated if we omit them. - HB */ AO_INLINE int AO_compare_and_swap_acquire(volatile AO_t *addr, AO_t old, AO_t new_val) { int result = AO_compare_and_swap(addr, old, new_val); AO_nop_full(); return result; } #define AO_HAVE_compare_and_swap_acquire AO_INLINE int AO_compare_and_swap_release(volatile AO_t *addr, AO_t old, AO_t new_val) { AO_nop_full(); return AO_compare_and_swap(addr, old, new_val); } #define AO_HAVE_compare_and_swap_release AO_INLINE int AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) { AO_t result; AO_nop_full(); result = AO_compare_and_swap(addr, old, new_val); AO_nop_full(); return result; } #define AO_HAVE_compare_and_swap_full /* * FIXME: We should also implement fetch_and_add and or primitives * directly. */ #include "../ao_t_is_int.h" ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/gcc/powerpc.h ================================================ /* * Copyright (c) 1991-1994 by Xerox Corporation. All rights reserved. * Copyright (c) 1996-1999 by Silicon Graphics. All rights reserved. * Copyright (c) 1999-2004 Hewlett-Packard Development Company, L.P. * * * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED * OR IMPLIED. ANY USE IS AT YOUR OWN RISK. * * Permission is hereby granted to use or copy this program * for any purpose, provided the above notices are retained on all copies. * Permission to modify the code and to distribute modified code is granted, * provided the above notices are retained, and a notice that the code was * modified is included with the above copyright notice. * */ /* Memory model documented at http://www-106.ibm.com/developerworks/ */ /* eserver/articles/archguide.html and (clearer) */ /* http://www-106.ibm.com/developerworks/eserver/articles/powerpc.html. */ /* There appears to be no implicit ordering between any kind of */ /* independent memory references. */ /* Architecture enforces some ordering based on control dependence. */ /* I don't know if that could help. */ /* Data-dependent loads are always ordered. */ /* Based on the above references, eieio is intended for use on */ /* uncached memory, which we don't support. It does not order loads */ /* from cached memory. */ /* Thanks to Maged Michael, Doug Lea, and Roger Hoover for helping to */ /* track some of this down and correcting my misunderstandings. -HB */ /* Earl Chew subsequently contributed further fixes & additions. */ #include "../all_aligned_atomic_load_store.h" #include "../test_and_set_t_is_ao_t.h" /* There seems to be no byte equivalent of lwarx, so this */ /* may really be what we want, at least in the 32-bit case. 
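/* Illustration (not part of this file): the ll/sc retry loop above is the
   shape that C11's *weak* compare-exchange is meant to map onto, since sc,
   like a weak CAS, may fail spuriously. A sketch of the portable pattern,
   synthesizing fetch-and-add from CAS: */
#include <stdatomic.h>

static unsigned long
fetch_and_add(_Atomic unsigned long *p, unsigned long incr)
{
  unsigned long old = atomic_load_explicit(p, memory_order_relaxed);

  /* On failure the CAS reloads 'old' with the current value; retry. */
  while (!atomic_compare_exchange_weak_explicit(p, &old, old + incr,
                                                memory_order_acq_rel,
                                                memory_order_relaxed))
    ;
  return old;
}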
*/ AO_INLINE void AO_nop_full(void) { __asm__ __volatile__("sync" : : : "memory"); } #define AO_HAVE_nop_full /* lwsync apparently works for everything but a StoreLoad barrier. */ AO_INLINE void AO_lwsync(void) { #ifdef __NO_LWSYNC__ __asm__ __volatile__("sync" : : : "memory"); #else __asm__ __volatile__("lwsync" : : : "memory"); #endif } #define AO_nop_write() AO_lwsync() #define AO_HAVE_nop_write #define AO_nop_read() AO_lwsync() #define AO_HAVE_nop_read /* We explicitly specify load_acquire, since it is important, and can */ /* be implemented relatively cheaply. It could be implemented */ /* with an ordinary load followed by a lwsync. But the general wisdom */ /* seems to be that a data dependent branch followed by an isync is */ /* cheaper. And the documentation is fairly explicit that this also */ /* has acquire semantics. */ /* ppc64 uses ld not lwz */ #if defined(__powerpc64__) || defined(__ppc64__) || defined(__64BIT__) AO_INLINE AO_t AO_load_acquire(const volatile AO_t *addr) { AO_t result; __asm__ __volatile__ ( "ld%U1%X1 %0,%1\n" "cmpw %0,%0\n" "bne- 1f\n" "1: isync\n" : "=r" (result) : "m"(*addr) : "memory", "cr0"); return result; } #else AO_INLINE AO_t AO_load_acquire(const volatile AO_t *addr) { AO_t result; /* FIXME: We should get gcc to allocate one of the condition */ /* registers. I always got "impossible constraint" when I */ /* tried the "y" constraint. */ __asm__ __volatile__ ( "lwz%U1%X1 %0,%1\n" "cmpw %0,%0\n" "bne- 1f\n" "1: isync\n" : "=r" (result) : "m"(*addr) : "memory", "cc"); return result; } #endif #define AO_HAVE_load_acquire /* We explicitly specify store_release, since it relies */ /* on the fact that lwsync is also a LoadStore barrier. */ AO_INLINE void AO_store_release(volatile AO_t *addr, AO_t value) { AO_lwsync(); *addr = value; } #define AO_HAVE_store_release /* This is similar to the code in the garbage collector. Deleting */ /* this and having it synthesized from compare_and_swap would probably */ /* only cost us a load immediate instruction. */ #if defined(__powerpc64__) || defined(__ppc64__) || defined(__64BIT__) /* Completely untested. And we should be using smaller objects anyway. */ AO_INLINE AO_TS_VAL_t AO_test_and_set(volatile AO_TS_t *addr) { unsigned long oldval; unsigned long temp = 1; /* locked value */ __asm__ __volatile__( "1:ldarx %0,0,%1\n" /* load and reserve */ "cmpdi %0, 0\n" /* if load is */ "bne 2f\n" /* non-zero, return already set */ "stdcx. %2,0,%1\n" /* else store conditional */ "bne- 1b\n" /* retry if lost reservation */ "2:\n" /* oldval is zero if we set */ : "=&r"(oldval) : "r"(addr), "r"(temp) : "memory", "cr0"); return (AO_TS_VAL_t)oldval; } #else AO_INLINE AO_TS_VAL_t AO_test_and_set(volatile AO_TS_t *addr) { int oldval; int temp = 1; /* locked value */ __asm__ __volatile__( "1:lwarx %0,0,%1\n" /* load and reserve */ "cmpwi %0, 0\n" /* if load is */ "bne 2f\n" /* non-zero, return already set */ "stwcx.
%2,0,%1\n" /* else store conditional */ "bne- 1b\n" /* retry if lost reservation */ "2:\n" /* oldval is zero if we set */ : "=&r"(oldval) : "r"(addr), "r"(temp) : "memory", "cr0"); return (AO_TS_VAL_t)oldval; } #endif #define AO_HAVE_test_and_set AO_INLINE AO_TS_VAL_t AO_test_and_set_acquire(volatile AO_TS_t *addr) { AO_TS_VAL_t result = AO_test_and_set(addr); AO_lwsync(); return result; } #define AO_HAVE_test_and_set_acquire AO_INLINE AO_TS_VAL_t AO_test_and_set_release(volatile AO_TS_t *addr) { AO_lwsync(); return AO_test_and_set(addr); } #define AO_HAVE_test_and_set_release AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t *addr) { AO_TS_VAL_t result; AO_lwsync(); result = AO_test_and_set(addr); AO_lwsync(); return result; } #define AO_HAVE_test_and_set_full #if defined(__powerpc64__) || defined(__ppc64__) || defined(__64BIT__) /* FIXME: Completely untested. */ AO_INLINE int AO_compare_and_swap(volatile AO_t *addr, AO_t old, AO_t new_val) { AO_t oldval; int result = 0; __asm__ __volatile__( "1:ldarx %0,0,%2\n" /* load and reserve */ "cmpd %0, %4\n" /* if load is not equal to */ "bne 2f\n" /* old, fail */ "stdcx. %3,0,%2\n" /* else store conditional */ "bne- 1b\n" /* retry if lost reservation */ "li %1,1\n" /* result = 1; */ "2:\n" : "=&r"(oldval), "=&r"(result) : "r"(addr), "r"(new_val), "r"(old), "1"(result) : "memory", "cr0"); return result; } #else AO_INLINE int AO_compare_and_swap(volatile AO_t *addr, AO_t old, AO_t new_val) { AO_t oldval; int result = 0; __asm__ __volatile__( "1:lwarx %0,0,%2\n" /* load and reserve */ "cmpw %0, %4\n" /* if load is not equal to */ "bne 2f\n" /* old, fail */ "stwcx. %3,0,%2\n" /* else store conditional */ "bne- 1b\n" /* retry if lost reservation */ "li %1,1\n" /* result = 1; */ "2:\n" : "=&r"(oldval), "=&r"(result) : "r"(addr), "r"(new_val), "r"(old), "1"(result) : "memory", "cr0"); return result; } #endif #define AO_HAVE_compare_and_swap AO_INLINE int AO_compare_and_swap_acquire(volatile AO_t *addr, AO_t old, AO_t new_val) { int result = AO_compare_and_swap(addr, old, new_val); AO_lwsync(); return result; } #define AO_HAVE_compare_and_swap_acquire AO_INLINE int AO_compare_and_swap_release(volatile AO_t *addr, AO_t old, AO_t new_val) { AO_lwsync(); return AO_compare_and_swap(addr, old, new_val); } #define AO_HAVE_compare_and_swap_release AO_INLINE int AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) { AO_t result; AO_lwsync(); result = AO_compare_and_swap(addr, old, new_val); AO_lwsync(); return result; } #define AO_HAVE_compare_and_swap_full #if defined(__powerpc64__) || defined(__ppc64__) || defined(__64BIT__) /* FIXME: Completely untested. */ AO_INLINE AO_t AO_fetch_and_add(volatile AO_t *addr, AO_t incr) { AO_t oldval; AO_t newval; __asm__ __volatile__( "1:ldarx %0,0,%2\n" /* load and reserve */ "add %1,%0,%3\n" /* increment */ "stdcx. %1,0,%2\n" /* store conditional */ "bne- 1b\n" /* retry if lost reservation */ : "=&r"(oldval), "=&r"(newval) : "r"(addr), "r"(incr) : "memory", "cr0"); return oldval; } #define AO_HAVE_fetch_and_add #else AO_INLINE AO_t AO_fetch_and_add(volatile AO_t *addr, AO_t incr) { AO_t oldval; AO_t newval; __asm__ __volatile__( "1:lwarx %0,0,%2\n" /* load and reserve */ "add %1,%0,%3\n" /* increment */ "stwcx. 
%1,0,%2\n" /* store conditional */ "bne- 1b\n" /* retry if lost reservation */ : "=&r"(oldval), "=&r"(newval) : "r"(addr), "r"(incr) : "memory", "cr0"); return oldval; } #define AO_HAVE_fetch_and_add #endif AO_INLINE AO_t AO_fetch_and_add_acquire(volatile AO_t *addr, AO_t incr) { AO_t result = AO_fetch_and_add(addr, incr); AO_lwsync(); return result; } #define AO_HAVE_fetch_and_add_acquire AO_INLINE AO_t AO_fetch_and_add_release(volatile AO_t *addr, AO_t incr) { AO_lwsync(); return AO_fetch_and_add(addr, incr); } #define AO_HAVE_fetch_and_add_release AO_INLINE AO_t AO_fetch_and_add_full(volatile AO_t *addr, AO_t incr) { AO_t result; AO_lwsync(); result = AO_fetch_and_add(addr, incr); AO_lwsync(); return result; } #define AO_HAVE_fetch_and_add_full #if defined(__powerpc64__) || defined(__ppc64__) || defined(__64BIT__) #else # include "../ao_t_is_int.h" #endif ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/gcc/s390.h ================================================ /* * Copyright (c) 1991-1994 by Xerox Corporation. All rights reserved. * Copyright (c) 1996-1999 by Silicon Graphics. All rights reserved. * Copyright (c) 1999-2003 by Hewlett-Packard Company. All rights reserved. * * * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED * OR IMPLIED. ANY USE IS AT YOUR OWN RISK. * * Permission is hereby granted to use or copy this program * for any purpose, provided the above notices are retained on all copies. * Permission to modify the code and to distribute modified code is granted, * provided the above notices are retained, and a notice that the code was * modified is included with the above copyright notice. * */ /* FIXME: untested. */ /* The relevant documentation appears to be at */ /* http://publibz.boulder.ibm.com/epubs/pdf/dz9zr003.pdf */ /* around page 5-96. Apparently: */ /* - Memory references in general are atomic only for a single */ /* byte. But it appears that the most common load/store */ /* instructions also guarantee atomicity for aligned */ /* operands of standard types. WE FOOLISHLY ASSUME that */ /* compilers only generate those. If that turns out to be */ /* wrong, we need inline assembly code for AO_load and */ /* AO_store. */ /* - A store followed by a load is unordered since the store */ /* may be delayed. Otherwise everything is ordered. */ /* - There is a hardware compare-and-swap (CS) instruction. */ #include "../ordered_except_wr.h" #include "../all_aligned_atomic_load_store.h" #include "../test_and_set_t_is_ao_t.h" /* FIXME: Is there a way to do byte-sized test-and-set? */ /* FIXME: AO_nop_full should probably be implemented directly. */ /* It appears that certain BCR instructions have that effect. */ /* Presumably they're cheaper than CS? */ AO_INLINE AO_t AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) { int retval; __asm__ __volatile__ ( # ifndef __s390x__ " cs %1,%2,0(%3)\n" # else " csg %1,%2,0(%3)\n" # endif " ipm %0\n" " srl %0,28\n" : "=&d" (retval), "+d" (old) : "d" (new_val), "a" (addr) : "cc", "memory"); return retval == 0; } #define AO_HAVE_compare_and_swap_full /* FIXME: Add double-wide compare-and-swap for 32-bit executables. */ ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/gcc/sh.h ================================================ /* * Copyright (c) 2009 by Takashi YOSHII. All rights reserved. 
* * * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED * OR IMPLIED. ANY USE IS AT YOUR OWN RISK. * * Permission is hereby granted to use or copy this program * for any purpose, provided the above notices are retained on all copies. * Permission to modify the code and to distribute modified code is granted, * provided the above notices are retained, and a notice that the code was * modified is included with the above copyright notice. */ #include "../all_atomic_load_store.h" #include "../ordered.h" /* sh has tas.b(byte) only */ #include "../test_and_set_t_is_char.h" AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t *addr) { int oldval; __asm__ __volatile__( "tas.b @%1; movt %0" : "=r" (oldval) : "r" (addr) : "t", "memory"); return oldval? AO_TS_CLEAR : AO_TS_SET; } #define AO_HAVE_test_and_set_full ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/gcc/sparc.h ================================================ /* * Copyright (c) 1991-1994 by Xerox Corporation. All rights reserved. * Copyright (c) 1996-1999 by Silicon Graphics. All rights reserved. * Copyright (c) 1999-2003 by Hewlett-Packard Company. All rights reserved. * * * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED * OR IMPLIED. ANY USE IS AT YOUR OWN RISK. * * Permission is hereby granted to use or copy this program * for any purpose, provided the above notices are retained on all copies. * Permission to modify the code and to distribute modified code is granted, * provided the above notices are retained, and a notice that the code was * modified is included with the above copyright notice. * */ /* FIXME. Very incomplete. No support for sparc64. */ /* Non-ancient SPARCs provide compare-and-swap (casa). */ /* We should make that available. */ #include "../all_atomic_load_store.h" /* Real SPARC code uses TSO: */ #include "../ordered_except_wr.h" /* Test_and_set location is just a byte. */ #include "../test_and_set_t_is_char.h" AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t *addr) { AO_TS_VAL_t oldval; __asm__ __volatile__("ldstub %1,%0" : "=r"(oldval), "=m"(*addr) : "m"(*addr) : "memory"); return oldval; } #define AO_HAVE_test_and_set_full #ifndef AO_NO_SPARC_V9 /* Returns nonzero if the comparison succeeded. */ AO_INLINE int AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) { char ret; __asm__ __volatile__ ("membar #StoreLoad | #LoadLoad\n\t" # if defined(__arch64__) "casx [%2],%0,%1\n\t" # else "cas [%2],%0,%1\n\t" /* 32-bit version */ # endif "membar #StoreLoad | #StoreStore\n\t" "cmp %0,%1\n\t" "be,a 0f\n\t" "mov 1,%0\n\t"/* one insn after branch always executed */ "clr %0\n\t" "0:\n\t" : "=r" (ret), "+r" (new_val) : "r" (addr), "0" (old) : "memory", "cc"); return (int)ret; } #define AO_HAVE_compare_and_swap_full #endif /* AO_NO_SPARC_V9 */ /* FIXME: This needs to be extended for SPARC v8 and v9. */ /* SPARC V8 also has swap. V9 has CAS. */ /* There are barriers like membar #LoadStore. */ /* CASA (32-bit) and CASXA(64-bit) instructions were */ /* added in V9. */ ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/gcc/x86.h ================================================ /* * Copyright (c) 1991-1994 by Xerox Corporation. All rights reserved. * Copyright (c) 1996-1999 by Silicon Graphics. All rights reserved. * Copyright (c) 1999-2003 by Hewlett-Packard Company. All rights reserved. 
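/* Illustration (not part of this file): the SPARC cas sequence above is
   bracketed by membar instructions to obtain full ordering. Approximately
   the same construction with portable C11 fences around a relaxed CAS: */
#include <stdatomic.h>
#include <stdbool.h>

static bool
cas_full(_Atomic unsigned long *addr, unsigned long old_val,
         unsigned long new_val)
{
  bool ok;
  atomic_thread_fence(memory_order_seq_cst); /* ~ membar #StoreLoad|#LoadLoad */
  ok = atomic_compare_exchange_strong_explicit(addr, &old_val, new_val,
                                               memory_order_relaxed,
                                               memory_order_relaxed);
  atomic_thread_fence(memory_order_seq_cst); /* ~ membar #StoreLoad|#StoreStore */
  return ok;
}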
* * * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED * OR IMPLIED. ANY USE IS AT YOUR OWN RISK. * * Permission is hereby granted to use or copy this program * for any purpose, provided the above notices are retained on all copies. * Permission to modify the code and to distribute modified code is granted, * provided the above notices are retained, and a notice that the code was * modified is included with the above copyright notice. * * Some of the machine specific code was borrowed from our GC distribution. */ /* The following really assume we have a 486 or better. Unfortunately */ /* gcc doesn't define a suitable feature test macro based on command */ /* line options. */ /* We should perhaps test dynamically. */ #include "../all_aligned_atomic_load_store.h" /* Real X86 implementations, except for some old WinChips, appear */ /* to enforce ordering between memory operations, EXCEPT that a later */ /* read can pass earlier writes, presumably due to the visible */ /* presence of store buffers. */ /* We ignore both the WinChips, and the fact that the official specs */ /* seem to be much weaker (and arguably too weak to be usable). */ #include "../ordered_except_wr.h" #include "../test_and_set_t_is_char.h" #include "../standard_ao_double_t.h" #if defined(AO_USE_PENTIUM4_INSTRS) AO_INLINE void AO_nop_full(void) { __asm__ __volatile__("mfence" : : : "memory"); } #define AO_HAVE_nop_full #else /* We could use the cpuid instruction. But that seems to be slower */ /* than the default implementation based on test_and_set_full. Thus */ /* we omit that bit of misinformation here. */ #endif /* As far as we can tell, the lfence and sfence instructions are not */ /* currently needed or useful for cached memory accesses. */ /* Really only works for 486 and later */ AO_INLINE AO_t AO_fetch_and_add_full (volatile AO_t *p, AO_t incr) { AO_t result; __asm__ __volatile__ ("lock; xaddl %0, %1" : "=r" (result), "=m" (*p) : "0" (incr), "m" (*p) : "memory"); return result; } #define AO_HAVE_fetch_and_add_full AO_INLINE unsigned char AO_char_fetch_and_add_full (volatile unsigned char *p, unsigned char incr) { unsigned char result; __asm__ __volatile__ ("lock; xaddb %0, %1" : "=q" (result), "=m" (*p) : "0" (incr), "m" (*p) : "memory"); return result; } #define AO_HAVE_char_fetch_and_add_full AO_INLINE unsigned short AO_short_fetch_and_add_full (volatile unsigned short *p, unsigned short incr) { unsigned short result; __asm__ __volatile__ ("lock; xaddw %0, %1" : "=r" (result), "=m" (*p) : "0" (incr), "m" (*p) : "memory"); return result; } #define AO_HAVE_short_fetch_and_add_full /* Really only works for 486 and later */ AO_INLINE void AO_or_full (volatile AO_t *p, AO_t incr) { __asm__ __volatile__ ("lock; orl %1, %0" : "=m" (*p) : "r" (incr), "m" (*p) : "memory"); } #define AO_HAVE_or_full AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t *addr) { unsigned char oldval; /* Note: the "xchg" instruction does not need a "lock" prefix */ __asm__ __volatile__("xchgb %0, %1" : "=q"(oldval), "=m"(*addr) : "0"(0xff), "m"(*addr) : "memory"); return (AO_TS_VAL_t)oldval; } #define AO_HAVE_test_and_set_full /* Returns nonzero if the comparison succeeded. 
*/ AO_INLINE int AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) { # ifdef AO_USE_SYNC_CAS_BUILTIN return (int)__sync_bool_compare_and_swap(addr, old, new_val); # else char result; __asm__ __volatile__("lock; cmpxchgl %3, %0; setz %1" : "=m" (*addr), "=a" (result) : "m" (*addr), "r" (new_val), "a" (old) : "memory"); return (int)result; # endif } #define AO_HAVE_compare_and_swap_full /* Returns nonzero if the comparison succeeded. */ /* Really requires at least a Pentium. */ AO_INLINE int AO_compare_double_and_swap_double_full(volatile AO_double_t *addr, AO_t old_val1, AO_t old_val2, AO_t new_val1, AO_t new_val2) { char result; #if __PIC__ /* If PIC is turned on, we can't use %ebx as it is reserved for the GOT pointer. We can save and restore %ebx because GCC won't be using it for anything else (such as any of the m operands) */ __asm__ __volatile__("pushl %%ebx;" /* save ebx used for PIC GOT ptr */ "movl %6,%%ebx;" /* move new_val2 to %ebx */ "lock; cmpxchg8b %0; setz %1;" "pop %%ebx;" /* restore %ebx */ : "=m"(*addr), "=a"(result) : "m"(*addr), "d" (old_val2), "a" (old_val1), "c" (new_val2), "m" (new_val1) : "memory"); #else /* We can't just do the same thing in non-PIC mode, because GCC * might be using %ebx as the memory operand. We could have ifdef'd * in a clobber, but there's no point doing the push/pop if we don't * have to. */ __asm__ __volatile__("lock; cmpxchg8b %0; setz %1;" : "=m"(*addr), "=a"(result) : "m"(*addr), "d" (old_val2), "a" (old_val1), "c" (new_val2), "b" (new_val1) : "memory"); #endif return (int) result; } #define AO_HAVE_compare_double_and_swap_double_full #include "../ao_t_is_int.h" ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/gcc/x86_64.h ================================================ /* * Copyright (c) 1991-1994 by Xerox Corporation. All rights reserved. * Copyright (c) 1996-1999 by Silicon Graphics. All rights reserved. * Copyright (c) 1999-2003 by Hewlett-Packard Company. All rights reserved. * * * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED * OR IMPLIED. ANY USE IS AT YOUR OWN RISK. * * Permission is hereby granted to use or copy this program * for any purpose, provided the above notices are retained on all copies. * Permission to modify the code and to distribute modified code is granted, * provided the above notices are retained, and a notice that the code was * modified is included with the above copyright notice. * * Some of the machine specific code was borrowed from our GC distribution. */ #include "../all_aligned_atomic_load_store.h" /* Real X86 implementations appear */ /* to enforce ordering between memory operations, EXCEPT that a later */ /* read can pass earlier writes, presumably due to the visible */ /* presence of store buffers. */ /* We ignore the fact that the official specs */ /* seem to be much weaker (and arguably too weak to be usable). */ #include "../ordered_except_wr.h" #include "../test_and_set_t_is_char.h" #include "../standard_ao_double_t.h" AO_INLINE void AO_nop_full(void) { /* Note: "mfence" (SSE2) is supported on all x86_64/amd64 chips. */ __asm__ __volatile__("mfence" : : : "memory"); } #define AO_HAVE_nop_full /* As far as we can tell, the lfence and sfence instructions are not */ /* currently needed or useful for cached memory accesses. 
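/* Illustration (not part of this file): "lock; xadd" fetch-and-add, as
   defined in the x86/x86_64 files above, is exactly what a ticket lock
   needs. A minimal sketch on top of this library's primitives; the struct
   and helper names are hypothetical, and AO_load_acquire/AO_store_release
   are assumed to be supplied by the generalization headers: */
typedef struct {
  volatile AO_t next_ticket; /* next ticket to hand out */
  volatile AO_t now_serving; /* ticket currently holding the lock */
} ticket_lock_t;             /* zero-initialize */

static void
ticket_lock(ticket_lock_t *l)
{
  AO_t me = AO_fetch_and_add_full(&l->next_ticket, 1); /* take a ticket */
  while (AO_load_acquire(&l->now_serving) != me)
    ; /* spin until served */
}

static void
ticket_unlock(ticket_lock_t *l)
{
  /* Only the lock holder writes now_serving, so one release store suffices. */
  AO_store_release(&l->now_serving, l->now_serving + 1);
}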
*/ AO_INLINE AO_t AO_fetch_and_add_full (volatile AO_t *p, AO_t incr) { AO_t result; __asm__ __volatile__ ("lock; xaddq %0, %1" : "=r" (result), "=m" (*p) : "0" (incr), "m" (*p) : "memory"); return result; } #define AO_HAVE_fetch_and_add_full AO_INLINE unsigned char AO_char_fetch_and_add_full (volatile unsigned char *p, unsigned char incr) { unsigned char result; __asm__ __volatile__ ("lock; xaddb %0, %1" : "=q" (result), "=m" (*p) : "0" (incr), "m" (*p) : "memory"); return result; } #define AO_HAVE_char_fetch_and_add_full AO_INLINE unsigned short AO_short_fetch_and_add_full (volatile unsigned short *p, unsigned short incr) { unsigned short result; __asm__ __volatile__ ("lock; xaddw %0, %1" : "=r" (result), "=m" (*p) : "0" (incr), "m" (*p) : "memory"); return result; } #define AO_HAVE_short_fetch_and_add_full AO_INLINE unsigned int AO_int_fetch_and_add_full (volatile unsigned int *p, unsigned int incr) { unsigned int result; __asm__ __volatile__ ("lock; xaddl %0, %1" : "=r" (result), "=m" (*p) : "0" (incr), "m" (*p) : "memory"); return result; } #define AO_HAVE_int_fetch_and_add_full AO_INLINE void AO_or_full (volatile AO_t *p, AO_t incr) { __asm__ __volatile__ ("lock; orq %1, %0" : "=m" (*p) : "r" (incr), "m" (*p) : "memory"); } #define AO_HAVE_or_full AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t *addr) { unsigned char oldval; /* Note: the "xchg" instruction does not need a "lock" prefix */ __asm__ __volatile__("xchgb %0, %1" : "=q"(oldval), "=m"(*addr) : "0"(0xff), "m"(*addr) : "memory"); return (AO_TS_VAL_t)oldval; } #define AO_HAVE_test_and_set_full /* Returns nonzero if the comparison succeeded. */ AO_INLINE int AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) { # ifdef AO_USE_SYNC_CAS_BUILTIN return (int)__sync_bool_compare_and_swap(addr, old, new_val); # else char result; __asm__ __volatile__("lock; cmpxchgq %3, %0; setz %1" : "=m" (*addr), "=a" (result) : "m" (*addr), "r" (new_val), "a" (old) : "memory"); return (int) result; # endif } #define AO_HAVE_compare_and_swap_full #ifdef AO_CMPXCHG16B_AVAILABLE /* NEC LE-IT: older AMD Opterons are missing this instruction. * On these machines SIGILL will be thrown. * Define AO_WEAK_DOUBLE_CAS_EMULATION to have an emulated * (lock based) version available */ /* HB: Changed this to not define either by default. There are * enough machines and tool chains around on which cmpxchg16b * doesn't work. And the emulation is unsafe by our usual rules. * However both are clearly useful in certain cases. */ AO_INLINE int AO_compare_double_and_swap_double_full(volatile AO_double_t *addr, AO_t old_val1, AO_t old_val2, AO_t new_val1, AO_t new_val2) { char result; __asm__ __volatile__("lock; cmpxchg16b %0; setz %1" : "=m"(*addr), "=a"(result) : "m"(*addr), "d" (old_val2), "a" (old_val1), "c" (new_val2), "b" (new_val1) : "memory"); return (int) result; } #define AO_HAVE_compare_double_and_swap_double_full #else /* this one provides spinlock based emulation of CAS implemented in */ /* atomic_ops.c. We probably do not want to do this here, since it is */ /* not atomic with respect to other kinds of updates of *addr. On the */ /* other hand, this may be a useful facility on occasion.
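/* Illustration (not part of this file): what a double-width CAS such as
   cmpxchg16b buys is the ability to update a pointer and a version counter
   in a single atomic step, the classic defense against ABA in lock-free
   stacks and queues. A sketch assuming a GCC-style compiler on x86_64 built
   with -mcx16, so that the 16-byte __sync builtin compiles to
   lock cmpxchg16b; all names here are illustrative: */
#include <stdint.h>
#include <string.h>

typedef struct {
  uintptr_t ptr;   /* e.g. top-of-stack pointer */
  uintptr_t count; /* bumped on every update to defeat ABA */
} __attribute__((aligned(16))) tagged_ptr_t;

static int
tagged_cas(tagged_ptr_t *loc, tagged_ptr_t expected, tagged_ptr_t desired)
{
  unsigned __int128 e, d;
  memcpy(&e, &expected, sizeof e);
  memcpy(&d, &desired, sizeof d);
  return __sync_bool_compare_and_swap((unsigned __int128 *)loc, e, d);
}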
*/ #ifdef AO_WEAK_DOUBLE_CAS_EMULATION int AO_compare_double_and_swap_double_emulation(volatile AO_double_t *addr, AO_t old_val1, AO_t old_val2, AO_t new_val1, AO_t new_val2); AO_INLINE int AO_compare_double_and_swap_double_full(volatile AO_double_t *addr, AO_t old_val1, AO_t old_val2, AO_t new_val1, AO_t new_val2) { return AO_compare_double_and_swap_double_emulation(addr, old_val1, old_val2, new_val1, new_val2); } #define AO_HAVE_compare_double_and_swap_double_full #endif /* AO_WEAK_DOUBLE_CAS_EMULATION */ #endif /* AO_CMPXCHG16B_AVAILABLE */ ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/generic_pthread.h ================================================ /* * Copyright (c) 2003 by Hewlett-Packard Company. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* The following is useful primarily for debugging and documentation. */ /* We define various atomic operations by acquiring a global pthread */ /* lock. The resulting implementation will perform poorly, but should */ /* be correct unless it is used from signal handlers. */ /* We assume that all pthread operations act like full memory barriers. */ /* (We believe that is the intent of the specification.) */ #include <pthread.h> #include "test_and_set_t_is_ao_t.h" /* This is not necessarily compatible with the native */ /* implementation. But those can't be safely mixed anyway. */ /* We define only the full barrier variants, and count on the */ /* generalization section below to fill in the rest.
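/* Illustration (not part of this file): every primitive below follows one
   template -- take the single global lock, do the operation on plain memory,
   release the lock. A hypothetical AO_fetch_and_xor_full in the same style,
   using the AO_pt_lock declared just below, shows how further primitives
   would be added: */
AO_INLINE AO_t
AO_fetch_and_xor_full(volatile AO_t *p, AO_t mask)
{
  AO_t tmp;

  pthread_mutex_lock(&AO_pt_lock);
  tmp = *p;
  *p = tmp ^ mask;
  pthread_mutex_unlock(&AO_pt_lock);
  return tmp;
}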
*/ extern pthread_mutex_t AO_pt_lock; AO_INLINE void AO_nop_full(void) { pthread_mutex_lock(&AO_pt_lock); pthread_mutex_unlock(&AO_pt_lock); } #define AO_HAVE_nop_full AO_INLINE AO_t AO_load_full(const volatile AO_t *addr) { AO_t result; pthread_mutex_lock(&AO_pt_lock); result = *addr; pthread_mutex_unlock(&AO_pt_lock); return result; } #define AO_HAVE_load_full AO_INLINE void AO_store_full(volatile AO_t *addr, AO_t val) { pthread_mutex_lock(&AO_pt_lock); *addr = val; pthread_mutex_unlock(&AO_pt_lock); } #define AO_HAVE_store_full AO_INLINE unsigned char AO_char_load_full(const volatile unsigned char *addr) { unsigned char result; pthread_mutex_lock(&AO_pt_lock); result = *addr; pthread_mutex_unlock(&AO_pt_lock); return result; } #define AO_HAVE_char_load_full AO_INLINE void AO_char_store_full(volatile unsigned char *addr, unsigned char val) { pthread_mutex_lock(&AO_pt_lock); *addr = val; pthread_mutex_unlock(&AO_pt_lock); } #define AO_HAVE_char_store_full AO_INLINE unsigned short AO_short_load_full(const volatile unsigned short *addr) { unsigned short result; pthread_mutex_lock(&AO_pt_lock); result = *addr; pthread_mutex_unlock(&AO_pt_lock); return result; } #define AO_HAVE_short_load_full AO_INLINE void AO_short_store_full(volatile unsigned short *addr, unsigned short val) { pthread_mutex_lock(&AO_pt_lock); *addr = val; pthread_mutex_unlock(&AO_pt_lock); } #define AO_HAVE_short_store_full AO_INLINE unsigned int AO_int_load_full(const volatile unsigned int *addr) { unsigned int result; pthread_mutex_lock(&AO_pt_lock); result = *addr; pthread_mutex_unlock(&AO_pt_lock); return result; } #define AO_HAVE_int_load_full AO_INLINE void AO_int_store_full(volatile unsigned int *addr, unsigned int val) { pthread_mutex_lock(&AO_pt_lock); *addr = val; pthread_mutex_unlock(&AO_pt_lock); } #define AO_HAVE_int_store_full AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t *addr) { AO_TS_VAL_t result; pthread_mutex_lock(&AO_pt_lock); result = (AO_TS_VAL_t)(*addr); *addr = AO_TS_SET; pthread_mutex_unlock(&AO_pt_lock); assert(result == AO_TS_SET || result == AO_TS_CLEAR); return result; } #define AO_HAVE_test_and_set_full AO_INLINE AO_t AO_fetch_and_add_full(volatile AO_t *p, AO_t incr) { AO_t tmp; pthread_mutex_lock(&AO_pt_lock); tmp = *p; *p = tmp + incr; pthread_mutex_unlock(&AO_pt_lock); return tmp; } #define AO_HAVE_fetch_and_add_full AO_INLINE unsigned char AO_char_fetch_and_add_full(volatile unsigned char *p, unsigned char incr) { unsigned char tmp; pthread_mutex_lock(&AO_pt_lock); tmp = *p; *p = tmp + incr; pthread_mutex_unlock(&AO_pt_lock); return tmp; } #define AO_HAVE_char_fetch_and_add_full AO_INLINE unsigned short AO_short_fetch_and_add_full(volatile unsigned short *p, unsigned short incr) { unsigned short tmp; pthread_mutex_lock(&AO_pt_lock); tmp = *p; *p = tmp + incr; pthread_mutex_unlock(&AO_pt_lock); return tmp; } #define AO_HAVE_short_fetch_and_add_full AO_INLINE unsigned int AO_int_fetch_and_add_full(volatile unsigned int *p, unsigned int incr) { unsigned int tmp; pthread_mutex_lock(&AO_pt_lock); tmp = *p; *p = tmp + incr; pthread_mutex_unlock(&AO_pt_lock); return tmp; } #define AO_HAVE_int_fetch_and_add_full AO_INLINE void AO_or_full(volatile AO_t *p, AO_t incr) { AO_t tmp; pthread_mutex_lock(&AO_pt_lock); tmp = *p; *p = (tmp | incr); pthread_mutex_unlock(&AO_pt_lock); } #define AO_HAVE_or_full AO_INLINE int AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) { pthread_mutex_lock(&AO_pt_lock); if (*addr == old) { *addr = new_val; 
pthread_mutex_unlock(&AO_pt_lock); return 1; } else pthread_mutex_unlock(&AO_pt_lock); return 0; } #define AO_HAVE_compare_and_swap_full /* Unlike real architectures, we define both double-width CAS variants. */ typedef struct { AO_t AO_val1; AO_t AO_val2; } AO_double_t; #define AO_HAVE_double_t AO_INLINE int AO_compare_double_and_swap_double_full(volatile AO_double_t *addr, AO_t old1, AO_t old2, AO_t new1, AO_t new2) { pthread_mutex_lock(&AO_pt_lock); if (addr -> AO_val1 == old1 && addr -> AO_val2 == old2) { addr -> AO_val1 = new1; addr -> AO_val2 = new2; pthread_mutex_unlock(&AO_pt_lock); return 1; } else pthread_mutex_unlock(&AO_pt_lock); return 0; } #define AO_HAVE_compare_double_and_swap_double_full AO_INLINE int AO_compare_and_swap_double_full(volatile AO_double_t *addr, AO_t old1, AO_t new1, AO_t new2) { pthread_mutex_lock(&AO_pt_lock); if (addr -> AO_val1 == old1) { addr -> AO_val1 = new1; addr -> AO_val2 = new2; pthread_mutex_unlock(&AO_pt_lock); return 1; } else pthread_mutex_unlock(&AO_pt_lock); return 0; } #define AO_HAVE_compare_and_swap_double_full /* We can't use hardware loads and stores, since they don't */ /* interact correctly with atomic updates. */ ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/hpc/hppa.h ================================================ /* * Copyright (c) 2003 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * Derived from the corresponding header file for gcc. * */ #include "../atomic_load_store.h" /* Some architecture set descriptions include special "ordered" memory */ /* operations. As far as we can tell, no existing processors actually */ /* require those. Nor does it appear likely that future processors */ /* will. */ /* FIXME: */ /* The PA emulator on Itanium may obey weaker restrictions. */ /* There should be a mode in which we don't assume sequential */ /* consistency here. */ #include "../ordered.h" #include /* GCC will not guarantee the alignment we need, use four lock words */ /* and select the correctly aligned datum. See the glibc 2.3.2 */ /* linuxthread port for the original implementation. */ struct AO_pa_clearable_loc { int data[4]; }; #undef AO_TS_INITIALIZER #define AO_TS_t struct AO_pa_clearable_loc #define AO_TS_INITIALIZER {1,1,1,1} /* Switch meaning of set and clear, since we only have an atomic clear */ /* instruction. 
*/ typedef enum {AO_PA_TS_set = 0, AO_PA_TS_clear = 1} AO_PA_TS_val; #define AO_TS_VAL_t AO_PA_TS_val #define AO_TS_CLEAR AO_PA_TS_clear #define AO_TS_SET AO_PA_TS_set /* The hppa only has one atomic read and modify memory operation, */ /* load and clear, so hppa spinlocks must use zero to signify that */ /* someone is holding the lock. The address used for the ldcw */ /* semaphore must be 16-byte aligned. */ #define __ldcw(a, ret) \ _LDCWX(0 /* index */, 0 /* s */, a /* base */, ret); /* Because malloc only guarantees 8-byte alignment for malloc'd data, */ /* and GCC only guarantees 8-byte alignment for stack locals, we can't */ /* be assured of 16-byte alignment for atomic lock data even if we */ /* specify "__attribute ((aligned(16)))" in the type declaration. So, */ /* we use a struct containing an array of four ints for the atomic lock */ /* type and dynamically select the 16-byte aligned int from the array */ /* for the semaphore. */ #define __PA_LDCW_ALIGNMENT 16 #define __ldcw_align(a, ret) { \ ret = (unsigned long) a; \ ret += __PA_LDCW_ALIGNMENT - 1; \ ret &= ~(__PA_LDCW_ALIGNMENT - 1); \ } /* Works on PA 1.1 and PA 2.0 systems */ AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t * addr) { register unsigned int ret; register unsigned long a; __ldcw_align (addr, a); __ldcw (a, ret); return ret; } AO_INLINE void AO_pa_clear(volatile AO_TS_t * addr) { unsigned long a; __ldcw_align (addr,a); AO_compiler_barrier(); *(volatile unsigned int *)a = 1; } #define AO_CLEAR(addr) AO_pa_clear(addr) #define AO_HAVE_test_and_set_full ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/hpc/ia64.h ================================================ /* * Copyright (c) 2003 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * This file specifies Itanium primitives for use with the HP compiler * under HP/UX. We use intrinsics instead of the inline assembly code in the * gcc file.
*/ #include "../all_atomic_load_store.h" #include "../all_acquire_release_volatile.h" #include "../test_and_set_t_is_char.h" #include #ifdef __LP64__ # define AO_T_FASIZE _FASZ_D # define AO_T_SIZE _SZ_D #else # define AO_T_FASIZE _FASZ_W # define AO_T_SIZE _SZ_W #endif AO_INLINE void AO_nop_full(void) { _Asm_mf(); } #define AO_HAVE_nop_full AO_INLINE AO_t AO_fetch_and_add1_acquire (volatile AO_t *p) { return _Asm_fetchadd(AO_T_FASIZE, _SEM_ACQ, p, 1, _LDHINT_NONE, _DOWN_MEM_FENCE); } #define AO_HAVE_fetch_and_add1_acquire AO_INLINE AO_t AO_fetch_and_add1_release (volatile AO_t *p) { return _Asm_fetchadd(AO_T_FASIZE, _SEM_REL, p, 1, _LDHINT_NONE, _UP_MEM_FENCE); } #define AO_HAVE_fetch_and_add1_release AO_INLINE AO_t AO_fetch_and_sub1_acquire (volatile AO_t *p) { return _Asm_fetchadd(AO_T_FASIZE, _SEM_ACQ, p, -1, _LDHINT_NONE, _DOWN_MEM_FENCE); } #define AO_HAVE_fetch_and_sub1_acquire AO_INLINE AO_t AO_fetch_and_sub1_release (volatile AO_t *p) { return _Asm_fetchadd(AO_T_FASIZE, _SEM_REL, p, -1, _LDHINT_NONE, _UP_MEM_FENCE); } #define AO_HAVE_fetch_and_sub1_release AO_INLINE int AO_compare_and_swap_acquire(volatile AO_t *addr, AO_t old, AO_t new_val) { AO_t oldval; _Asm_mov_to_ar(_AREG_CCV, old, _DOWN_MEM_FENCE); oldval = _Asm_cmpxchg(AO_T_SIZE, _SEM_ACQ, addr, new_val, _LDHINT_NONE, _DOWN_MEM_FENCE); return (oldval == old); } #define AO_HAVE_compare_and_swap_acquire AO_INLINE int AO_compare_and_swap_release(volatile AO_t *addr, AO_t old, AO_t new_val) { AO_t oldval; _Asm_mov_to_ar(_AREG_CCV, old, _UP_MEM_FENCE); oldval = _Asm_cmpxchg(AO_T_SIZE, _SEM_REL, addr, new_val, _LDHINT_NONE, _UP_MEM_FENCE); /* Hopefully the compiler knows not to reorder the above two? */ return (oldval == old); } #define AO_HAVE_compare_and_swap_release AO_INLINE int AO_char_compare_and_swap_acquire(volatile unsigned char *addr, unsigned char old, unsigned char new_val) { unsigned char oldval; _Asm_mov_to_ar(_AREG_CCV, old, _DOWN_MEM_FENCE); oldval = _Asm_cmpxchg(_SZ_B, _SEM_ACQ, addr, new_val, _LDHINT_NONE, _DOWN_MEM_FENCE); return (oldval == old); } #define AO_HAVE_char_compare_and_swap_acquire AO_INLINE int AO_char_compare_and_swap_release(volatile unsigned char *addr, unsigned char old, unsigned char new_val) { unsigned char oldval; _Asm_mov_to_ar(_AREG_CCV, old, _UP_MEM_FENCE); oldval = _Asm_cmpxchg(_SZ_B, _SEM_REL, addr, new_val, _LDHINT_NONE, _UP_MEM_FENCE); /* Hopefully the compiler knows not to reorder the above two? */ return (oldval == old); } #define AO_HAVE_char_compare_and_swap_release AO_INLINE int AO_short_compare_and_swap_acquire(volatile unsigned short *addr, unsigned short old, unsigned short new_val) { unsigned short oldval; _Asm_mov_to_ar(_AREG_CCV, old, _DOWN_MEM_FENCE); oldval = _Asm_cmpxchg(_SZ_B, _SEM_ACQ, addr, new_val, _LDHINT_NONE, _DOWN_MEM_FENCE); return (oldval == old); } #define AO_HAVE_short_compare_and_swap_acquire AO_INLINE int AO_short_compare_and_swap_release(volatile unsigned short *addr, unsigned short old, unsigned short new_val) { unsigned short oldval; _Asm_mov_to_ar(_AREG_CCV, old, _UP_MEM_FENCE); oldval = _Asm_cmpxchg(_SZ_B, _SEM_REL, addr, new_val, _LDHINT_NONE, _UP_MEM_FENCE); /* Hopefully the compiler knows not to reorder the above two? */ return (oldval == old); } #define AO_HAVE_short_compare_and_swap_release #ifndef __LP64__ # include "../ao_t_is_int.h" #endif ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/ibmc/powerpc.h ================================================ /* FIXME. 
This is only a placeholder for the AIX compiler. */ /* It doesn't work. Please send a patch. */ /* Memory model documented at http://www-106.ibm.com/developerworks/ */ /* eserver/articles/archguide.html and (clearer) */ /* http://www-106.ibm.com/developerworks/eserver/articles/powerpc.html. */ /* There appears to be no implicit ordering between any kind of */ /* independent memory references. */ /* Architecture enforces some ordering based on control dependence. */ /* I don't know if that could help. */ /* Data-dependent loads are always ordered. */ /* Based on the above references, eieio is intended for use on */ /* uncached memory, which we don't support. It does not order loads */ /* from cached memory. */ /* Thanks to Maged Michael, Doug Lea, and Roger Hoover for helping to */ /* track some of this down and correcting my misunderstandings. -HB */ #include "../all_aligned_atomic_load_store.h" void AO_sync(void); #pragma mc_func AO_sync { "7c0004ac" } #ifdef __NO_LWSYNC__ # define AO_lwsync AO_sync #else void AO_lwsync(void); #pragma mc_func AO_lwsync { "7c2004ac" } #endif #define AO_nop_write() AO_lwsync() #define AO_HAVE_nop_write #define AO_nop_read() AO_lwsync() #define AO_HAVE_nop_read /* We explicitly specify load_acquire and store_release, since these */ /* rely on the fact that lwsync is also a LoadStore barrier. */ AO_INLINE AO_t AO_load_acquire(const volatile AO_t *addr) { AO_t result = *addr; AO_lwsync(); return result; } #define AO_HAVE_load_acquire AO_INLINE void AO_store_release(volatile AO_t *addr, AO_t value) { AO_lwsync(); *addr = value; } #define AO_HAVE_store_release /* This is similar to the code in the garbage collector. Deleting */ /* this and having it synthesized from compare_and_swap would probably */ /* only cost us a load immediate instruction. */ /*AO_INLINE AO_TS_VAL_t AO_test_and_set(volatile AO_TS_t *addr) { # error FIXME Implement me } #define AO_HAVE_test_and_set*/ AO_INLINE AO_TS_VAL_t AO_test_and_set_acquire(volatile AO_TS_t *addr) { AO_TS_VAL_t result = AO_test_and_set(addr); AO_lwsync(); return result; } #define AO_HAVE_test_and_set_acquire AO_INLINE AO_TS_VAL_t AO_test_and_set_release(volatile AO_TS_t *addr) { AO_lwsync(); return AO_test_and_set(addr); } #define AO_HAVE_test_and_set_release AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t *addr) { AO_TS_VAL_t result; AO_lwsync(); result = AO_test_and_set(addr); AO_lwsync(); return result; } #define AO_HAVE_test_and_set_full /*AO_INLINE AO_t AO_compare_and_swap(volatile AO_t *addr, AO_t old, AO_t new_val) { # error FIXME Implement me } #define AO_HAVE_compare_and_swap*/ AO_INLINE AO_t AO_compare_and_swap_acquire(volatile AO_t *addr, AO_t old, AO_t new_val) { AO_t result = AO_compare_and_swap(addr, old, new_val); AO_lwsync(); return result; } #define AO_HAVE_compare_and_swap_acquire AO_INLINE AO_t AO_compare_and_swap_release(volatile AO_t *addr, AO_t old, AO_t new_val) { AO_lwsync(); return AO_compare_and_swap(addr, old, new_val); } #define AO_HAVE_compare_and_swap_release AO_INLINE AO_t AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) { AO_t result; AO_lwsync(); result = AO_compare_and_swap(addr, old, new_val); AO_lwsync(); return result; } #define AO_HAVE_compare_and_swap_full /* FIXME: We should also implement fetch_and_add and or primitives */ /* directly.
*/ ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/icc/ia64.h ================================================ /* * Copyright (c) 2003 by Hewlett-Packard Company. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * This file specifies Itanium primitives for use with the Intel (ecc) * compiler. We use intrinsics instead of the inline assembly code in the * gcc file. */ #include "../all_atomic_load_store.h" #include "../test_and_set_t_is_char.h" #include <ia64intrin.h> /* The acquire release semantics of volatile can be turned off. And volatile */ /* operations in icc9 don't imply ordering with respect to other nonvolatile */ /* operations. */ #define AO_INTEL_PTR_t void * AO_INLINE AO_t AO_load_acquire(const volatile AO_t *p) { return (AO_t)(__ld8_acq((AO_INTEL_PTR_t)p)); } #define AO_HAVE_load_acquire AO_INLINE void AO_store_release(volatile AO_t *p, AO_t val) { __st8_rel((AO_INTEL_PTR_t)p, (__int64)val); } #define AO_HAVE_store_release AO_INLINE unsigned char AO_char_load_acquire(const volatile unsigned char *p) { /* A normal volatile load generates an ld.acq */ return (__ld1_acq((AO_INTEL_PTR_t)p)); } #define AO_HAVE_char_load_acquire AO_INLINE void AO_char_store_release(volatile unsigned char *p, unsigned char val) { __st1_rel((AO_INTEL_PTR_t)p, val); } #define AO_HAVE_char_store_release AO_INLINE unsigned short AO_short_load_acquire(const volatile unsigned short *p) { /* A normal volatile load generates an ld.acq */ return (__ld2_acq((AO_INTEL_PTR_t)p)); } #define AO_HAVE_short_load_acquire AO_INLINE void AO_short_store_release(volatile unsigned short *p, unsigned short val) { __st2_rel((AO_INTEL_PTR_t)p, val); } #define AO_HAVE_short_store_release AO_INLINE unsigned int AO_int_load_acquire(const volatile unsigned int *p) { /* A normal volatile load generates an ld.acq */ return (__ld4_acq((AO_INTEL_PTR_t)p)); } #define AO_HAVE_int_load_acquire AO_INLINE void AO_int_store_release(volatile unsigned int *p, unsigned int val) { __st4_rel((AO_INTEL_PTR_t)p, val); } #define AO_HAVE_int_store_release AO_INLINE void AO_nop_full(void) { __mf(); } #define AO_HAVE_nop_full AO_INLINE AO_t AO_fetch_and_add1_acquire (volatile AO_t *p) { return __fetchadd8_acq((unsigned __int64 *)p, 1); } #define AO_HAVE_fetch_and_add1_acquire AO_INLINE AO_t AO_fetch_and_add1_release (volatile AO_t *p) { return __fetchadd8_rel((unsigned __int64 *)p, 1); } #define AO_HAVE_fetch_and_add1_release AO_INLINE AO_t AO_fetch_and_sub1_acquire
(volatile AO_t *p) { return __fetchadd8_acq((unsigned __int64 *)p, -1); } #define AO_HAVE_fetch_and_sub1_acquire AO_INLINE AO_t AO_fetch_and_sub1_release (volatile AO_t *p) { return __fetchadd8_rel((unsigned __int64 *)p, -1); } #define AO_HAVE_fetch_and_sub1_release AO_INLINE int AO_compare_and_swap_acquire(volatile AO_t *addr, AO_t old, AO_t new_val) { AO_t oldval; oldval = _InterlockedCompareExchange64_acq(addr, new_val, old); return (oldval == old); } #define AO_HAVE_compare_and_swap_acquire AO_INLINE int AO_compare_and_swap_release(volatile AO_t *addr, AO_t old, AO_t new_val) { AO_t oldval; oldval = _InterlockedCompareExchange64_rel(addr, new_val, old); return (oldval == old); } #define AO_HAVE_compare_and_swap_release AO_INLINE int AO_char_compare_and_swap_acquire(volatile unsigned char *addr, unsigned char old, unsigned char new_val) { unsigned char oldval; oldval = _InterlockedCompareExchange8_acq(addr, new_val, old); return (oldval == old); } #define AO_HAVE_char_compare_and_swap_acquire AO_INLINE int AO_char_compare_and_swap_release(volatile unsigned char *addr, unsigned char old, unsigned char new_val) { unsigned char oldval; oldval = _InterlockedCompareExchange8_rel(addr, new_val, old); return (oldval == old); } #define AO_HAVE_char_compare_and_swap_release AO_INLINE int AO_short_compare_and_swap_acquire(volatile unsigned short *addr, unsigned short old, unsigned short new_val) { unsigned short oldval; oldval = _InterlockedCompareExchange16_acq(addr, new_val, old); return (oldval == old); } #define AO_HAVE_short_compare_and_swap_acquire AO_INLINE int AO_short_compare_and_swap_release(volatile unsigned short *addr, unsigned short old, unsigned short new_val) { unsigned short oldval; oldval = _InterlockedCompareExchange16_rel(addr, new_val, old); return (oldval == old); } #define AO_HAVE_short_compare_and_swap_release AO_INLINE int AO_int_compare_and_swap_acquire(volatile unsigned int *addr, unsigned int old, unsigned int new_val) { unsigned int oldval; oldval = _InterlockedCompareExchange_acq(addr, new_val, old); return (oldval == old); } #define AO_HAVE_int_compare_and_swap_acquire AO_INLINE int AO_int_compare_and_swap_release(volatile unsigned int *addr, unsigned int old, unsigned int new_val) { unsigned int oldval; oldval = _InterlockedCompareExchange_rel(addr, new_val, old); return (oldval == old); } #define AO_HAVE_int_compare_and_swap_release ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/int_acquire_release_volatile.h ================================================ /* * Copyright (c) 2003-2004 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
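/* Illustration (not part of this file): both ia64 ports above provide
   fetchadd only for increments of +1/-1, because the hardware fetchadd
   instruction accepts just a handful of immediate values; other increments
   must be synthesized from compare-and-swap. The standard construction, in
   terms of the AO_compare_and_swap_acquire defined above (helper name
   hypothetical): */
static AO_t
fetch_and_add_n_acquire(volatile AO_t *p, AO_t n)
{
  AO_t old;

  do {
    old = *p;                                    /* read the current value */
  } while (!AO_compare_and_swap_acquire(p, old, old + n));
  return old;                                    /* value before the add */
}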
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * This file adds definitions appropriate for environments in which an unsigned * int volatile load has acquire semantics, and an unsigned short volatile * store has release semantics. This is true with the standard Itanium ABI. */ #if !defined(AO_GCC_BARRIER) # if defined(__GNUC__) # define AO_GCC_BARRIER() AO_compiler_barrier() # else # define AO_GCC_BARRIER() # endif #endif AO_INLINE unsigned int AO_int_load_acquire(const volatile unsigned int *p) { unsigned int result = *p; /* A normal volatile load generates an ld.acq */ AO_GCC_BARRIER(); return result; } #define AO_HAVE_int_load_acquire AO_INLINE void AO_int_store_release(volatile unsigned int *p, unsigned int val) { AO_GCC_BARRIER(); /* A normal volatile store generates an st.rel */ *p = val; } #define AO_HAVE_int_store_release ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/int_aligned_atomic_load_store.h ================================================ /* * Copyright (c) 2004 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * Definitions for architectures on which loads and stores of unsigned int are * atomic for all legal alignments. */ AO_INLINE unsigned int AO_int_load(const volatile unsigned int *addr) { assert(((size_t)addr & (sizeof(unsigned int) - 1)) == 0); /* Cast away the volatile for architectures like IA64 where */ /* volatile adds barrier semantics. */ return (*(unsigned int *)addr); } #define AO_HAVE_int_load AO_INLINE void AO_int_store(volatile unsigned int *addr, unsigned int new_val) { assert(((size_t)addr & (sizeof(unsigned int) - 1)) == 0); (*(unsigned int *)addr) = new_val; } #define AO_HAVE_int_store ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/int_atomic_load_store.h ================================================ /* * Copyright (c) 2003 by Hewlett-Packard Company. All rights reserved. 
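/* Illustration (not part of this file): the int_aligned/int_atomic
   load-store headers encode a contract rather than an instruction -- on the
   targets that include them, a naturally aligned int access is a single
   indivisible access, so a plain load is already atomic and the volatile
   qualifier is cast away only to avoid picking up fence semantics (as on
   IA64). The C11 analogue of that contract is a relaxed atomic access: */
#include <stdatomic.h>

static _Atomic unsigned int hit_count;

static unsigned int
read_hits(void)
{
  /* Atomicity only, no ordering: the counterpart of AO_int_load. */
  return atomic_load_explicit(&hit_count, memory_order_relaxed);
}

static void
set_hits(unsigned int v)
{
  atomic_store_explicit(&hit_count, v, memory_order_relaxed);
}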
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * Definitions for architectures on which loads and stores of unsigned int are * atomic for all legal alignments. */ AO_INLINE unsigned int AO_int_load(const volatile unsigned int *addr) { /* Cast away the volatile for architectures like IA64 where */ /* volatile adds barrier semantics. */ return (*(const unsigned int *)addr); } #define AO_HAVE_int_load AO_INLINE void AO_int_store(volatile unsigned int *addr, unsigned int new_val) { (*(unsigned int *)addr) = new_val; } #define AO_HAVE_int_store ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/msftc/arm.h ================================================ /* * Copyright (c) 2003 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "../read_ordered.h" #ifndef AO_ASSUME_WINDOWS98 /* CAS is always available */ # define AO_ASSUME_WINDOWS98 #endif #include "common32_defs.h" /* FIXME: Do _InterlockedOps really have a full memory barrier? */ /* (MSDN WinCE docs say nothing about it.) */ #if _M_ARM >= 6 /* ARMv6 is the first architecture providing support for simple LL/SC. */ #include "../standard_ao_double_t.h" /* If only a single processor is used, we can define AO_UNIPROCESSOR */ /* and do not need to access CP15 for ensuring a DMB at all. */ #ifdef AO_UNIPROCESSOR AO_INLINE void AO_nop_full(void) {} # define AO_HAVE_nop_full #else /* AO_nop_full() is emulated using AO_test_and_set_full(). 
*/ #endif #include "../test_and_set_t_is_ao_t.h" /* AO_test_and_set() is emulated using CAS. */ AO_INLINE AO_t AO_load(const volatile AO_t *addr) { /* Cast away the volatile in case it adds fence semantics */ return (*(const AO_t *)addr); } #define AO_HAVE_load AO_INLINE void AO_store_full(volatile AO_t *addr, AO_t value) { /* Emulate atomic store using CAS. */ AO_t old = AO_load(addr); AO_t current; # ifdef AO_OLD_STYLE_INTERLOCKED_COMPARE_EXCHANGE while ((current = (AO_t)_InterlockedCompareExchange( (PVOID AO_INTERLOCKED_VOLATILE *)addr, (PVOID)value, (PVOID)old)) != old) old = current; # else while ((current = (AO_t)_InterlockedCompareExchange( (LONG AO_INTERLOCKED_VOLATILE *)addr, (LONG)value, (LONG)old)) != old) old = current; # endif } #define AO_HAVE_store_full /* FIXME: implement AO_compare_double_and_swap_double() */ #else /* _M_ARM < 6 */ /* Some slide set, if it has been read correctly, claims that Loads */ /* followed by either a Load or a Store are ordered, but nothing */ /* else is. It appears that SWP is the only simple memory barrier. */ #include "../all_atomic_load_store.h" #include "../test_and_set_t_is_ao_t.h" /* AO_test_and_set_full() is emulated using CAS. */ #endif /* _M_ARM < 6 */ ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/msftc/common32_defs.h ================================================ /* * Copyright (c) 2003 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* This file contains AO primitives based on VC++ built-in intrinsic */ /* functions commonly available across 32-bit architectures. */ /* This file should be included from arch-specific header files. */ /* Define AO_USE_INTERLOCKED_INTRINSICS if _Interlocked primitives */ /* (used below) are available as intrinsic ones for a target arch */ /* (otherwise "Interlocked" functions family is used instead). */ /* Define AO_ASSUME_WINDOWS98 if CAS is available. */ #include <windows.h> /* Seems like over-kill, but that's what MSDN recommends. */ /* And apparently winbase.h is not always self-contained. */
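The AO_store_full() emulation above is one instance of a general recipe: any read-modify-write can be synthesized from plain CAS by re-reading and retrying until the exchange lands. A minimal sketch in the same spirit; or_full_sketch is a hypothetical name, and AO_load plus AO_compare_and_swap_full are assumed to be supplied by the platform headers.

#include "atomic_ops.h"

/* Atomically OR a mask into *addr, retrying while other threads interfere. */
static void or_full_sketch(volatile AO_t *addr, AO_t mask) {
    AO_t old;
    do {
        old = AO_load(addr);               /* snapshot the current value */
    } while (!AO_compare_and_swap_full(addr, old, old | mask));
}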
#if _MSC_VER < 1310 || !defined(AO_USE_INTERLOCKED_INTRINSICS) # define _InterlockedIncrement InterlockedIncrement # define _InterlockedDecrement InterlockedDecrement # define _InterlockedExchange InterlockedExchange # define _InterlockedExchangeAdd InterlockedExchangeAdd # define _InterlockedCompareExchange InterlockedCompareExchange # define AO_INTERLOCKED_VOLATILE /**/ #else /* elif _MSC_VER >= 1310 */ # if _MSC_VER >= 1400 # ifndef _WIN32_WCE # include <intrin.h> # endif # pragma intrinsic (_ReadWriteBarrier) # else /* elif _MSC_VER < 1400 */ # ifdef __cplusplus extern "C" { # endif LONG __cdecl _InterlockedIncrement(LONG volatile *); LONG __cdecl _InterlockedDecrement(LONG volatile *); LONG __cdecl _InterlockedExchangeAdd(LONG volatile *, LONG); LONG __cdecl _InterlockedExchange(LONG volatile *, LONG); LONG __cdecl _InterlockedCompareExchange(LONG volatile *, LONG /* Exchange */, LONG /* Comp */); # ifdef __cplusplus } # endif # endif /* _MSC_VER < 1400 */ # pragma intrinsic (_InterlockedIncrement) # pragma intrinsic (_InterlockedDecrement) # pragma intrinsic (_InterlockedExchange) # pragma intrinsic (_InterlockedExchangeAdd) # pragma intrinsic (_InterlockedCompareExchange) # define AO_INTERLOCKED_VOLATILE volatile #endif /* _MSC_VER >= 1310 */ AO_INLINE AO_t AO_fetch_and_add_full(volatile AO_t *p, AO_t incr) { return _InterlockedExchangeAdd((LONG AO_INTERLOCKED_VOLATILE *)p, (LONG)incr); } #define AO_HAVE_fetch_and_add_full AO_INLINE AO_t AO_fetch_and_add1_full(volatile AO_t *p) { return _InterlockedIncrement((LONG AO_INTERLOCKED_VOLATILE *)p) - 1; } #define AO_HAVE_fetch_and_add1_full AO_INLINE AO_t AO_fetch_and_sub1_full(volatile AO_t *p) { return _InterlockedDecrement((LONG AO_INTERLOCKED_VOLATILE *)p) + 1; } #define AO_HAVE_fetch_and_sub1_full #ifdef AO_ASSUME_WINDOWS98 /* Returns nonzero if the comparison succeeded. */ AO_INLINE int AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) { # ifdef AO_OLD_STYLE_INTERLOCKED_COMPARE_EXCHANGE return _InterlockedCompareExchange((PVOID AO_INTERLOCKED_VOLATILE *)addr, (PVOID)new_val, (PVOID)old) == (PVOID)old; # else return _InterlockedCompareExchange((LONG AO_INTERLOCKED_VOLATILE *)addr, (LONG)new_val, (LONG)old) == (LONG)old; # endif } # define AO_HAVE_compare_and_swap_full #endif /* AO_ASSUME_WINDOWS98 */ ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/msftc/x86.h ================================================ /* * Copyright (c) 2003 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* If AO_ASSUME_WINDOWS98 is defined, we assume Windows 98 or newer. */ /* If AO_ASSUME_VISTA is defined, we assume Windows Server 2003, Vista */ /* or later. */ #include "../all_aligned_atomic_load_store.h" /* Real X86 implementations, except for some old WinChips, appear */ /* to enforce ordering between memory operations, EXCEPT that a later */ /* read can pass earlier writes, presumably due to the visible */ /* presence of store buffers. */ /* We ignore both the WinChips, and the fact that the official specs */ /* seem to be much weaker (and arguably too weak to be usable). */ #include "../ordered_except_wr.h" #include "../test_and_set_t_is_char.h" #ifndef AO_USE_INTERLOCKED_INTRINSICS /* _Interlocked primitives (Inc, Dec, Xchg, Add) are always available */ # define AO_USE_INTERLOCKED_INTRINSICS #endif #include "common32_defs.h" /* As far as we can tell, the lfence and sfence instructions are not */ /* currently needed or useful for cached memory accesses. */ /* Unfortunately mfence doesn't exist everywhere. */ /* IsProcessorFeaturePresent(PF_COMPARE_EXCHANGE128) is */ /* probably a conservative test for it? */ #if defined(AO_USE_PENTIUM4_INSTRS) AO_INLINE void AO_nop_full(void) { __asm { mfence } } #define AO_HAVE_nop_full #else /* We could use the cpuid instruction. But that seems to be slower */ /* than the default implementation based on test_and_set_full. Thus */ /* we omit that bit of misinformation here. */ #endif AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t *addr) { __asm { mov eax,0xff ; /* AO_TS_SET */ mov ebx,addr ; xchg byte ptr [ebx],al ; } /* Ignore possible "missing return value" warning here. */ } #define AO_HAVE_test_and_set_full #ifdef _WIN64 # error wrong architecture #endif #ifdef AO_ASSUME_VISTA /* NEC LE-IT: whenever we run on a pentium class machine we have that * certain function */ #include "../standard_ao_double_t.h" #pragma intrinsic (_InterlockedCompareExchange64) /* Returns nonzero if the comparison succeeded. */ AO_INLINE int AO_compare_double_and_swap_double_full(volatile AO_double_t *addr, AO_t old_val1, AO_t old_val2, AO_t new_val1, AO_t new_val2) { __int64 oldv = (__int64)old_val1 | ((__int64)old_val2 << 32); __int64 newv = (__int64)new_val1 | ((__int64)new_val2 << 32); return _InterlockedCompareExchange64((__int64 volatile *)addr, newv, oldv) == oldv; } #define AO_HAVE_compare_double_and_swap_double_full #ifdef __cplusplus AO_INLINE int AO_double_compare_and_swap_full(volatile AO_double_t *addr, AO_double_t old_val, AO_double_t new_val) { return _InterlockedCompareExchange64((__int64 volatile *)addr, new_val.AO_whole, old_val.AO_whole) == old_val.AO_whole; } #define AO_HAVE_double_compare_and_swap_full #endif /* __cplusplus */ #endif /* AO_ASSUME_VISTA */ #include "../ao_t_is_int.h" ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/msftc/x86_64.h ================================================ /* * Copyright (c) 2003 Hewlett-Packard Development Company, L.P. 
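AO_test_and_set_full() above returns the prior value of the test-and-set location, which is exactly what a test-and-set spinlock needs: keep setting until the prior value was clear. A minimal sketch; spin_lock and spin_unlock are hypothetical names, and AO_TS_INITIALIZER and AO_CLEAR come from atomic_ops.h further below.

#include "atomic_ops.h"

static AO_TS_t lock_word = AO_TS_INITIALIZER;

static void spin_lock(void) {
    while (AO_test_and_set_full(&lock_word) == AO_TS_SET) {
        /* busy-wait: another thread holds the lock */
    }
}

static void spin_unlock(void) {
    AO_CLEAR(&lock_word);    /* normally an AO_store_release of AO_TS_CLEAR */
}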
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "../all_aligned_atomic_load_store.h" /* Real X86 implementations appear */ /* to enforce ordering between memory operations, EXCEPT that a later */ /* read can pass earlier writes, presumably due to the visible */ /* presence of store buffers. */ /* We ignore the fact that the official specs */ /* seem to be much weaker (and arguably too weak to be usable). */ #include "../ordered_except_wr.h" #ifdef AO_ASM_X64_AVAILABLE # include "../test_and_set_t_is_char.h" #else # include "../test_and_set_t_is_ao_t.h" #endif #include "../standard_ao_double_t.h" #include <windows.h> /* Seems like over-kill, but that's what MSDN recommends. */ /* And apparently winbase.h is not always self-contained. */ /* Assume _MSC_VER >= 1400 */ #include <intrin.h> #pragma intrinsic (_ReadWriteBarrier) #pragma intrinsic (_InterlockedIncrement64) #pragma intrinsic (_InterlockedDecrement64) #pragma intrinsic (_InterlockedExchange64) #pragma intrinsic (_InterlockedExchangeAdd64) #pragma intrinsic (_InterlockedCompareExchange64) AO_INLINE AO_t AO_fetch_and_add_full (volatile AO_t *p, AO_t incr) { return _InterlockedExchangeAdd64((LONGLONG volatile *)p, (LONGLONG)incr); } #define AO_HAVE_fetch_and_add_full AO_INLINE AO_t AO_fetch_and_add1_full (volatile AO_t *p) { return _InterlockedIncrement64((LONGLONG volatile *)p) - 1; } #define AO_HAVE_fetch_and_add1_full AO_INLINE AO_t AO_fetch_and_sub1_full (volatile AO_t *p) { return _InterlockedDecrement64((LONGLONG volatile *)p) + 1; } #define AO_HAVE_fetch_and_sub1_full AO_INLINE int AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) { return _InterlockedCompareExchange64((LONGLONG volatile *)addr, (LONGLONG)new_val, (LONGLONG)old) == (LONGLONG)old; } #define AO_HAVE_compare_and_swap_full /* As far as we can tell, the lfence and sfence instructions are not */ /* currently needed or useful for cached memory accesses. */ #ifdef AO_ASM_X64_AVAILABLE AO_INLINE void AO_nop_full(void) { /* Note: "mfence" (SSE2) is supported on all x86_64/amd64 chips. */ __asm { mfence } } #define AO_HAVE_nop_full AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t *addr) { __asm { mov rax,AO_TS_SET ; mov rbx,addr ; xchg byte ptr [rbx],al ; } } #define AO_HAVE_test_and_set_full #endif /* AO_ASM_X64_AVAILABLE */ #ifdef AO_CMPXCHG16B_AVAILABLE /* AO_compare_double_and_swap_double_full needs implementation for Win64. * Also see ../gcc/x86_64.h for partial old Opteron workaround.
*/ # if _MSC_VER >= 1500 #pragma intrinsic (_InterlockedCompareExchange128) AO_INLINE int AO_compare_double_and_swap_double_full(volatile AO_double_t *addr, AO_t old_val1, AO_t old_val2, AO_t new_val1, AO_t new_val2) { __int64 comparandResult[2]; comparandResult[0] = old_val1; /* low */ comparandResult[1] = old_val2; /* high */ return _InterlockedCompareExchange128((volatile __int64 *)addr, new_val2 /* high */, new_val1 /* low */, comparandResult); } # define AO_HAVE_compare_double_and_swap_double_full # elif defined(AO_ASM_X64_AVAILABLE) /* If there is no intrinsic _InterlockedCompareExchange128 then we * need basically what's given below. */ AO_INLINE int AO_compare_double_and_swap_double_full(volatile AO_double_t *addr, AO_t old_val1, AO_t old_val2, AO_t new_val1, AO_t new_val2) { __asm { mov rdx,QWORD PTR [old_val2] ; mov rax,QWORD PTR [old_val1] ; mov rcx,QWORD PTR [new_val2] ; mov rbx,QWORD PTR [new_val1] ; lock cmpxchg16b [addr] ; setz rax ; } } # define AO_HAVE_compare_double_and_swap_double_full # endif /* _MSC_VER >= 1500 || AO_ASM_X64_AVAILABLE */ #endif /* AO_CMPXCHG16B_AVAILABLE */ ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/ordered.h ================================================ /* * Copyright (c) 2003 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * These are common definitions for architectures that provide processor * ordered memory operations. */ #include "ordered_except_wr.h" AO_INLINE void AO_nop_full(void) { AO_compiler_barrier(); } #define AO_HAVE_nop_full ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/ordered_except_wr.h ================================================ /* * Copyright (c) 2003 by Hewlett-Packard Company. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * These are common definitions for architectures that provide processor * ordered memory operations except that a later read may pass an * earlier write. Real x86 implementations seem to be in this category, * except apparently for some IDT WinChips, which we ignore. */ #include "read_ordered.h" AO_INLINE void AO_nop_write(void) { AO_compiler_barrier(); /* sfence according to Intel docs. Pentium 3 and up. */ /* Unnecessary for cached accesses? */ } #define AO_HAVE_NOP_WRITE #if defined(AO_HAVE_store) AO_INLINE void AO_store_write(volatile AO_t *addr, AO_t val) { AO_compiler_barrier(); AO_store(addr, val); } # define AO_HAVE_store_write # define AO_store_release(addr, val) AO_store_write(addr, val) # define AO_HAVE_store_release #endif /* AO_HAVE_store */ #if defined(AO_HAVE_char_store) AO_INLINE void AO_char_store_write(volatile unsigned char *addr, unsigned char val) { AO_compiler_barrier(); AO_char_store(addr, val); } # define AO_HAVE_char_store_write # define AO_char_store_release(addr, val) AO_char_store_write(addr, val) # define AO_HAVE_char_store_release #endif /* AO_HAVE_char_store */ #if defined(AO_HAVE_short_store) AO_INLINE void AO_short_store_write(volatile unsigned short *addr, unsigned short val) { AO_compiler_barrier(); AO_short_store(addr, val); } # define AO_HAVE_short_store_write # define AO_short_store_release(addr, val) AO_short_store_write(addr, val) # define AO_HAVE_short_store_release #endif /* AO_HAVE_short_store */ #if defined(AO_HAVE_int_store) AO_INLINE void AO_int_store_write(volatile unsigned int *addr, unsigned int val) { AO_compiler_barrier(); AO_int_store(addr, val); } # define AO_HAVE_int_store_write # define AO_int_store_release(addr, val) AO_int_store_write(addr, val) # define AO_HAVE_int_store_release #endif /* AO_HAVE_int_store */ ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/read_ordered.h ================================================ /* * Copyright (c) 2003 by Hewlett-Packard Company. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * These are common definitions for architectures that provide processor * ordered memory operations except that a later read may pass an * earlier write. Real x86 implementations seem to be in this category, * except apparently for some IDT WinChips, which we ignore. */ AO_INLINE void AO_nop_read(void) { AO_compiler_barrier(); } #define AO_HAVE_NOP_READ #ifdef AO_HAVE_load AO_INLINE AO_t AO_load_read(const volatile AO_t *addr) { AO_t result = AO_load(addr); AO_compiler_barrier(); return result; } #define AO_HAVE_load_read #define AO_load_acquire(addr) AO_load_read(addr) #define AO_HAVE_load_acquire #endif /* AO_HAVE_load */ #ifdef AO_HAVE_char_load AO_INLINE AO_t AO_char_load_read(const volatile unsigned char *addr) { AO_t result = AO_char_load(addr); AO_compiler_barrier(); return result; } #define AO_HAVE_char_load_read #define AO_char_load_acquire(addr) AO_char_load_read(addr) #define AO_HAVE_char_load_acquire #endif /* AO_HAVE_char_load */ #ifdef AO_HAVE_short_load AO_INLINE AO_t AO_short_load_read(const volatile unsigned short *addr) { AO_t result = AO_short_load(addr); AO_compiler_barrier(); return result; } #define AO_HAVE_short_load_read #define AO_short_load_acquire(addr) AO_short_load_read(addr) #define AO_HAVE_short_load_acquire #endif /* AO_HAVE_short_load */ #ifdef AO_HAVE_int_load AO_INLINE AO_t AO_int_load_read(const volatile unsigned int *addr) { AO_t result = AO_int_load(addr); AO_compiler_barrier(); return result; } #define AO_HAVE_int_load_read #define AO_int_load_acquire(addr) AO_int_load_read(addr) #define AO_HAVE_int_load_acquire #endif /* AO_HAVE_int_load */ ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/short_acquire_release_volatile.h ================================================ /* * Copyright (c) 2003-2004 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * This file adds definitions appropriate for environments in which an unsigned short * volatile load has acquire semantics, and an unsigned short volatile store has release * semantics. This is true with the standard Itanium ABI. 
*/ #if !defined(AO_GCC_BARRIER) # if defined(__GNUC__) # define AO_GCC_BARRIER() AO_compiler_barrier() # else # define AO_GCC_BARRIER() # endif #endif AO_INLINE unsigned short AO_short_load_acquire(const volatile unsigned short *p) { unsigned short result = *p; /* A normal volatile load generates an ld.acq */ AO_GCC_BARRIER(); return result; } #define AO_HAVE_short_load_acquire AO_INLINE void AO_short_store_release(volatile unsigned short *p, unsigned short val) { AO_GCC_BARRIER(); /* A normal volatile store generates an st.rel */ *p = val; } #define AO_HAVE_short_store_release ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/short_aligned_atomic_load_store.h ================================================ /* * Copyright (c) 2004 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * Definitions for architectures on which loads and stores of unsigned short * are atomic for all legal alignments. */ AO_INLINE unsigned short AO_short_load(const volatile unsigned short *addr) { assert(((size_t)addr & (sizeof(unsigned short) - 1)) == 0); /* Cast away the volatile for architectures like IA64 where */ /* volatile adds barrier semantics. */ return (*(unsigned short *)addr); } #define AO_HAVE_short_load AO_INLINE void AO_short_store(volatile unsigned short *addr, unsigned short new_val) { assert(((size_t)addr & (sizeof(unsigned short) - 1)) == 0); (*(unsigned short *)addr) = new_val; } #define AO_HAVE_short_store ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/short_atomic_load_store.h ================================================ /* * Copyright (c) 2003 by Hewlett-Packard Company. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * Definitions for architectures on which loads and stores of unsigned short * are atomic for all legal alignments. */ AO_INLINE unsigned short AO_short_load(const volatile unsigned short *addr) { /* Cast away the volatile for architectures like IA64 where */ /* volatile adds barrier semantics. */ return (*(const unsigned short *)addr); } #define AO_HAVE_short_load AO_INLINE void AO_short_store(volatile unsigned short *addr, unsigned short new_val) { (*(unsigned short *)addr) = new_val; } #define AO_HAVE_short_store ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/standard_ao_double_t.h ================================================ /* NEC LE-IT: For 64Bit OS we extend the double type to hold two int64's * * x86-64: __m128 serves as placeholder which also requires the compiler * to align it on 16 byte boundary (as required by cmpxchg16b). * Similar things could be done for PowerPC 64bit using a VMX data type... */ #if (defined(__x86_64__) && defined(__GNUC__)) || defined(_WIN64) # include <xmmintrin.h> typedef __m128 double_ptr_storage; #elif defined(_WIN32) && !defined(__GNUC__) typedef unsigned __int64 double_ptr_storage; #else typedef unsigned long long double_ptr_storage; #endif # define AO_HAVE_DOUBLE_PTR_STORAGE typedef union { double_ptr_storage AO_whole; struct {AO_t AO_v1; AO_t AO_v2;} AO_parts; } AO_double_t; #define AO_HAVE_double_t #define AO_val1 AO_parts.AO_v1 #define AO_val2 AO_parts.AO_v2 ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/sunc/sparc.h ================================================ /* * Copyright (c) 2004 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "../all_atomic_load_store.h" /* Real SPARC code uses TSO: */ #include "../ordered_except_wr.h" /* Test_and_set location is just a byte.
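AO_double_t above packs two AO_t words (reachable as AO_val1 and AO_val2) into one double-width unit so that cmpxchg16b-style instructions can swap a pointer and a version tag together, the standard ABA guard. A minimal sketch; retag_ptr is a hypothetical name, and the whole block is guarded by the corresponding AO_HAVE_ macro because double-width CAS is not available on every target.

#include "atomic_ops.h"

#ifdef AO_HAVE_compare_double_and_swap_double_full
/* Swing a version-tagged pointer: the tag (AO_val2) is bumped on every
   successful swap, so a recycled pointer value cannot match a stale tag.
   The two reads form an unsynchronized snapshot; the double-width CAS
   validates that snapshot atomically. Returns nonzero on success. */
static int retag_ptr(volatile AO_double_t *loc, AO_t new_ptr) {
    AO_t old_ptr = loc->AO_val1;
    AO_t old_tag = loc->AO_val2;
    return AO_compare_double_and_swap_double_full(loc, old_ptr, old_tag,
                                                  new_ptr, old_tag + 1);
}
#endif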
*/ #include "../test_and_set_t_is_char.h" extern AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t *addr); /* Implemented in separate .S file, for now. */ #define AO_HAVE_test_and_set_full /* FIXME: Like the gcc version, this needs to be extended for V8 */ /* and V9. */ ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/sunc/x86.h ================================================ /* * Copyright (c) 1991-1994 by Xerox Corporation. All rights reserved. * Copyright (c) 1996-1999 by Silicon Graphics. All rights reserved. * Copyright (c) 1999-2003 by Hewlett-Packard Company. All rights reserved. * * * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED * OR IMPLIED. ANY USE IS AT YOUR OWN RISK. * * Permission is hereby granted to use or copy this program * for any purpose, provided the above notices are retained on all copies. * Permission to modify the code and to distribute modified code is granted, * provided the above notices are retained, and a notice that the code was * modified is included with the above copyright notice. * * Some of the machine specific code was borrowed from our GC distribution. */ /* The following really assume we have a 486 or better. */ #include "../all_aligned_atomic_load_store.h" /* Real X86 implementations, except for some old WinChips, appear */ /* to enforce ordering between memory operations, EXCEPT that a later */ /* read can pass earlier writes, presumably due to the visible */ /* presence of store buffers. */ /* We ignore both the WinChips, and the fact that the official specs */ /* seem to be much weaker (and arguably too weak to be usable). */ #include "../ordered_except_wr.h" #include "../test_and_set_t_is_char.h" #include "../standard_ao_double_t.h" #if defined(AO_USE_PENTIUM4_INSTRS) AO_INLINE void AO_nop_full(void) { __asm__ __volatile__("mfence" : : : "memory"); } #define AO_HAVE_nop_full #else /* We could use the cpuid instruction. But that seems to be slower */ /* than the default implementation based on test_and_set_full. Thus */ /* we omit that bit of misinformation here. */ #endif /* As far as we can tell, the lfence and sfence instructions are not */ /* currently needed or useful for cached memory accesses. 
*/ /* Really only works for 486 and later */ AO_INLINE AO_t AO_fetch_and_add_full (volatile AO_t *p, AO_t incr) { AO_t result; __asm__ __volatile__ ("lock; xaddl %0, %1" : "=r" (result), "=m" (*p) : "0" (incr) /* , "m" (*p) */ : "memory"); return result; } #define AO_HAVE_fetch_and_add_full AO_INLINE unsigned char AO_char_fetch_and_add_full (volatile unsigned char *p, unsigned char incr) { unsigned char result; __asm__ __volatile__ ("lock; xaddb %0, %1" : "=q" (result), "=m" (*p) : "0" (incr) /* , "m" (*p) */ : "memory"); return result; } #define AO_HAVE_char_fetch_and_add_full AO_INLINE unsigned short AO_short_fetch_and_add_full (volatile unsigned short *p, unsigned short incr) { unsigned short result; __asm__ __volatile__ ("lock; xaddw %0, %1" : "=r" (result), "=m" (*p) : "0" (incr) /* , "m" (*p) */ : "memory"); return result; } #define AO_HAVE_short_fetch_and_add_full /* Really only works for 486 and later */ AO_INLINE void AO_or_full (volatile AO_t *p, AO_t incr) { __asm__ __volatile__ ("lock; orl %1, %0" : "=m" (*p) : "r" (incr) /* , "m" (*p) */ : "memory"); } #define AO_HAVE_or_full AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t *addr) { AO_TS_t oldval; /* Note: the "xchg" instruction does not need a "lock" prefix */ __asm__ __volatile__("xchg %0, %1" : "=q"(oldval), "=m"(*addr) : "0"(0xff) /* , "m"(*addr) */ : "memory"); return (AO_TS_VAL_t)oldval; } #define AO_HAVE_test_and_set_full /* Returns nonzero if the comparison succeeded. */ AO_INLINE int AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) { char result; __asm__ __volatile__("lock; cmpxchgl %2, %0; setz %1" : "=m"(*addr), "=a"(result) : "r" (new_val), "a"(old) : "memory"); return (int) result; } #define AO_HAVE_compare_and_swap_full #if 0 /* FIXME: not tested (and probably wrong). Besides, */ /* it tickles a bug in Sun C 5.10 (when optimizing). */ /* Returns nonzero if the comparison succeeded. */ /* Really requires at least a Pentium. */ AO_INLINE int AO_compare_double_and_swap_double_full(volatile AO_double_t *addr, AO_t old_val1, AO_t old_val2, AO_t new_val1, AO_t new_val2) { char result; #if __PIC__ /* If PIC is turned on, we can't use %ebx as it is reserved for the GOT pointer. We can save and restore %ebx because GCC won't be using it for anything else (such as any of the m operands) */ __asm__ __volatile__("pushl %%ebx;" /* save ebx used for PIC GOT ptr */ "movl %6,%%ebx;" /* move new_val2 to %ebx */ "lock; cmpxchg8b %0; setz %1;" "pop %%ebx;" /* restore %ebx */ : "=m"(*addr), "=a"(result) : "m"(*addr), "d" (old_val2), "a" (old_val1), "c" (new_val2), "m" (new_val1) : "memory"); #else /* We can't just do the same thing in non-PIC mode, because GCC * might be using %ebx as the memory operand. We could have ifdef'd * in a clobber, but there's no point doing the push/pop if we don't * have to. */ __asm__ __volatile__("lock; cmpxchg8b %0; setz %1;" : "=m"(*addr), "=a"(result) : /* "m"(*addr), */ "d" (old_val2), "a" (old_val1), "c" (new_val2), "b" (new_val1) : "memory"); #endif return (int) result; } #define AO_HAVE_compare_double_and_swap_double_full #endif #include "../ao_t_is_int.h" ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/sunc/x86_64.h ================================================ /* * Copyright (c) 1991-1994 by Xerox Corporation. All rights reserved. * Copyright (c) 1996-1999 by Silicon Graphics. All rights reserved. * Copyright (c) 1999-2003 by Hewlett-Packard Company. All rights reserved. 
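The lock; xadd based AO_fetch_and_add_full() above turns a shared counter update into a single instruction rather than a CAS retry loop. A minimal sketch; op_count, note_op and read_ops are hypothetical names, and AO_fetch_and_add1_full is assumed to be derived from AO_fetch_and_add_full by the generalization pass described later in atomic_ops.h.

#include "atomic_ops.h"

static volatile AO_t op_count = 0;

static void note_op(void) {
    AO_fetch_and_add1_full(&op_count);   /* one atomic increment, no retry loop */
}

static AO_t read_ops(void) {
    return AO_load(&op_count);           /* atomic word-sized read */
}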
* * * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED * OR IMPLIED. ANY USE IS AT YOUR OWN RISK. * * Permission is hereby granted to use or copy this program * for any purpose, provided the above notices are retained on all copies. * Permission to modify the code and to distribute modified code is granted, * provided the above notices are retained, and a notice that the code was * modified is included with the above copyright notice. * * Some of the machine specific code was borrowed from our GC distribution. */ #include "../all_aligned_atomic_load_store.h" /* Real X86 implementations appear */ /* to enforce ordering between memory operations, EXCEPT that a later */ /* read can pass earlier writes, presumably due to the visible */ /* presence of store buffers. */ /* We ignore the fact that the official specs */ /* seem to be much weaker (and arguably too weak to be usable). */ #include "../ordered_except_wr.h" #include "../test_and_set_t_is_char.h" #include "../standard_ao_double_t.h" AO_INLINE void AO_nop_full(void) { /* Note: "mfence" (SSE2) is supported on all x86_64/amd64 chips. */ __asm__ __volatile__("mfence" : : : "memory"); } #define AO_HAVE_nop_full /* As far as we can tell, the lfence and sfence instructions are not */ /* currently needed or useful for cached memory accesses. */ AO_INLINE AO_t AO_fetch_and_add_full (volatile AO_t *p, AO_t incr) { AO_t result; __asm__ __volatile__ ("lock; xaddq %0, %1" : "=r" (result), "=m" (*p) : "0" (incr) /* , "m" (*p) */ : "memory"); return result; } #define AO_HAVE_fetch_and_add_full AO_INLINE unsigned char AO_char_fetch_and_add_full (volatile unsigned char *p, unsigned char incr) { unsigned char result; __asm__ __volatile__ ("lock; xaddb %0, %1" : "=q" (result), "=m" (*p) : "0" (incr) /* , "m" (*p) */ : "memory"); return result; } #define AO_HAVE_char_fetch_and_add_full AO_INLINE unsigned short AO_short_fetch_and_add_full (volatile unsigned short *p, unsigned short incr) { unsigned short result; __asm__ __volatile__ ("lock; xaddw %0, %1" : "=r" (result), "=m" (*p) : "0" (incr) /* , "m" (*p) */ : "memory"); return result; } #define AO_HAVE_short_fetch_and_add_full AO_INLINE unsigned int AO_int_fetch_and_add_full (volatile unsigned int *p, unsigned int incr) { unsigned int result; __asm__ __volatile__ ("lock; xaddl %0, %1" : "=r" (result), "=m" (*p) : "0" (incr) /* , "m" (*p) */ : "memory"); return result; } #define AO_HAVE_int_fetch_and_add_full AO_INLINE void AO_or_full (volatile AO_t *p, AO_t incr) { __asm__ __volatile__ ("lock; orq %1, %0" : "=m" (*p) : "r" (incr) /* , "m" (*p) */ : "memory"); } #define AO_HAVE_or_full AO_INLINE AO_TS_VAL_t AO_test_and_set_full(volatile AO_TS_t *addr) { AO_TS_t oldval; /* Note: the "xchg" instruction does not need a "lock" prefix */ __asm__ __volatile__("xchg %0, %1" : "=q"(oldval), "=m"(*addr) : "0"(0xff) /* , "m"(*addr) */ : "memory"); return (AO_TS_VAL_t)oldval; } #define AO_HAVE_test_and_set_full /* Returns nonzero if the comparison succeeded. */ AO_INLINE int AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) { char result; __asm__ __volatile__("lock; cmpxchgq %2, %0; setz %1" : "=m"(*addr), "=a"(result) : "r" (new_val), "a"(old) : "memory"); return (int) result; } #define AO_HAVE_compare_and_swap_full #ifdef AO_CMPXCHG16B_AVAILABLE /* NEC LE-IT: older AMD Opterons are missing this instruction. * On these machines SIGILL will be thrown.
* Define AO_WEAK_DOUBLE_CAS_EMULATION to have an emulated * (lock based) version available */ /* HB: Changed this to not define either by default. There are * enough machines and tool chains around on which cmpxchg16b * doesn't work. And the emulation is unsafe by our usual rules. * However both are clearly useful in certain cases. */ AO_INLINE int AO_compare_double_and_swap_double_full(volatile AO_double_t *addr, AO_t old_val1, AO_t old_val2, AO_t new_val1, AO_t new_val2) { char result; __asm__ __volatile__("lock; cmpxchg16b %0; setz %1" : "=m"(*addr), "=a"(result) : "m"(*addr), "d" (old_val2), "a" (old_val1), "c" (new_val2), "b" (new_val1) : "memory"); return (int) result; } #define AO_HAVE_compare_double_and_swap_double_full #else /* this one provides spinlock based emulation of CAS implemented in */ /* atomic_ops.c. We probably do not want to do this here, since it is */ /* not atomic with respect to other kinds of updates of *addr. On the */ /* other hand, this may be a useful facility on occasion. */ #ifdef AO_WEAK_DOUBLE_CAS_EMULATION int AO_compare_double_and_swap_double_emulation(volatile AO_double_t *addr, AO_t old_val1, AO_t old_val2, AO_t new_val1, AO_t new_val2); AO_INLINE int AO_compare_double_and_swap_double_full(volatile AO_double_t *addr, AO_t old_val1, AO_t old_val2, AO_t new_val1, AO_t new_val2) { return AO_compare_double_and_swap_double_emulation(addr, old_val1, old_val2, new_val1, new_val2); } #define AO_HAVE_compare_double_and_swap_double_full #endif /* AO_WEAK_DOUBLE_CAS_EMULATION */ #endif /* AO_CMPXCHG16B_AVAILABLE */ ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/test_and_set_t_is_ao_t.h ================================================ /* * Copyright (c) 2004 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * These are common definitions for architectures on which test_and_set * operates on pointer-sized quantities, the "clear" value contains * all zeroes, and the "set" value contains only the lowest bit set. * This can be used if test_and_set is synthesized from compare_and_swap.
*/ typedef enum {AO_TS_clear = 0, AO_TS_set = 1} AO_TS_val; #define AO_TS_VAL_t AO_TS_val #define AO_TS_CLEAR AO_TS_clear #define AO_TS_SET AO_TS_set #define AO_TS_t AO_t #define AO_AO_TS_T 1 ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops/sysdeps/test_and_set_t_is_char.h ================================================ /* * Copyright (c) 2004 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* * These are common definitions for architectures on which test_and_set * operates on byte sized quantities, the "clear" value contains * all zeroes, and the "set" value contains all ones. */ #define AO_TS_t unsigned char typedef enum {AO_BYTE_TS_clear = 0, AO_BYTE_TS_set = 0xff} AO_BYTE_TS_val; #define AO_TS_VAL_t AO_BYTE_TS_val #define AO_TS_CLEAR AO_BYTE_TS_clear #define AO_TS_SET AO_BYTE_TS_set #define AO_CHAR_TS_T 1 ================================================ FILE: datastructures/trevor_brown_abtree/common/atomic_ops/atomic_ops.h ================================================ /* * Copyright (c) 2003 Hewlett-Packard Development Company, L.P. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef ATOMIC_OPS_H #define ATOMIC_OPS_H #include <assert.h> #include <stddef.h> /* We define various atomic operations on memory in a */ /* machine-specific way. Unfortunately, this is complicated */ /* by the fact that these may or may not be combined with */ /* various memory barriers.
Thus the actual operations we */ /* define have the form AO_<atomic-op>_<barrier>, for all */ /* plausible combinations of <atomic-op> and <barrier>. */ /* This of course results in a mild combinatorial explosion. */ /* To deal with it, we try to generate derived */ /* definitions for as many of the combinations as we can, as */ /* automatically as possible. */ /* */ /* Our assumption throughout is that the programmer will */ /* specify the least demanding operation and memory barrier */ /* that will guarantee correctness for the implementation. */ /* Our job is to find the least expensive way to implement it */ /* on the applicable hardware. In many cases that will */ /* involve, for example, a stronger memory barrier, or a */ /* combination of hardware primitives. */ /* */ /* Conventions: */ /* "plain" atomic operations are not guaranteed to include */ /* a barrier. The suffix in the name specifies the barrier */ /* type. Suffixes are: */ /* _release: Earlier operations may not be delayed past it. */ /* _acquire: Later operations may not move ahead of it. */ /* _read: Subsequent reads must follow this operation and */ /* preceding reads. */ /* _write: Earlier writes precede both this operation and */ /* later writes. */ /* _full: Ordered with respect to both earlier and later memops.*/ /* _release_write: Ordered with respect to earlier writes. */ /* _acquire_read: Ordered with respect to later reads. */ /* */ /* Currently we try to define the following atomic memory */ /* operations, in combination with the above barriers: */ /* AO_nop */ /* AO_load */ /* AO_store */ /* AO_test_and_set (binary) */ /* AO_fetch_and_add */ /* AO_fetch_and_add1 */ /* AO_fetch_and_sub1 */ /* AO_or */ /* AO_compare_and_swap */ /* */ /* Note that atomicity guarantees are valid only if both */ /* readers and writers use AO_ operations to access the */ /* shared value, while ordering constraints are intended to */ /* apply to all memory operations. If a location can potentially */ /* be accessed simultaneously from multiple threads, and one of */ /* those accesses may be a write access, then all such */ /* accesses to that location should be through AO_ primitives. */ /* However if AO_ operations enforce sufficient ordering to */ /* ensure that a location x cannot be accessed concurrently, */ /* or can only be read concurrently, then x can be accessed */ /* via ordinary references and assignments. */ /* */ /* Compare_and_exchange takes an address and an expected old */ /* value and a new value, and returns an int. Nonzero */ /* indicates that it succeeded. */ /* Test_and_set takes an address, atomically replaces it by */ /* AO_TS_SET, and returns the prior value. */ /* An AO_TS_t location can be reset with the */ /* AO_CLEAR macro, which normally uses AO_store_release. */ /* AO_fetch_and_add takes an address and an AO_t increment */ /* value. The AO_fetch_and_add1 and AO_fetch_and_sub1 variants */ /* are provided, since they allow faster implementations on */ /* some hardware. AO_or atomically ors an AO_t value into a */ /* memory location, but does not provide access to the original.*/ /* */ /* We expect this list to grow slowly over time. */ /* */ /* Note that AO_nop_full is a full memory barrier. */ /* */ /* Note that if some data is initialized with */ /* data.x = ...; data.y = ...; ... */ /* AO_store_release_write(&data_is_initialized, 1) */ /* then data is guaranteed to be initialized after the test */ /* if (AO_load_acquire_read(&data_is_initialized)) ... */ /* succeeds.
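A concrete version of the initialization idiom described just above; struct settings and the helper names are hypothetical, and AO_store_release_write/AO_load_acquire_read are assumed to be supplied by the generalization section for the target.

#include "atomic_ops.h"

struct settings { int x; int y; };
static struct settings data;
static volatile AO_t data_is_initialized = 0;

static void init_settings(void) {
    data.x = 1;
    data.y = 2;                                        /* plain writes */
    AO_store_release_write(&data_is_initialized, 1);   /* publish */
}

static int use_settings(void) {
    if (AO_load_acquire_read(&data_is_initialized))
        return data.x + data.y;    /* the plain writes are visible here */
    return -1;                     /* initialization not yet observed */
}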
Furthermore, this should generate near-optimal */ /* code on all common platforms. */ /* */ /* All operations operate on unsigned AO_t, which */ /* is the natural word size, and usually unsigned long. */ /* It is possible to check whether a particular operation op */ /* is available on a particular platform by checking whether */ /* AO_HAVE_op is defined. We make heavy use of these macros */ /* internally. */ /* The rest of this file basically has three sections: */ /* */ /* Some utility and default definitions. */ /* */ /* The architecture dependent section: */ /* This defines atomic operations that have direct hardware */ /* support on a particular platform, mostly by including the */ /* appropriate compiler- and hardware-dependent file. */ /* */ /* The synthesis section: */ /* This tries to define other atomic operations in terms of */ /* those that are explicitly available on the platform. */ /* This section is hardware independent. */ /* We make no attempt to synthesize operations in ways that */ /* effectively introduce locks, except for the debugging/demo */ /* pthread-based implementation at the beginning. A more */ /* realistic implementation that falls back to locks could be */ /* added as a higher layer. But that would sacrifice */ /* usability from signal handlers. */ /* The synthesis section is implemented almost entirely in */ /* atomic_ops_generalize.h. */ /* Some common defaults. Overridden for some architectures. */ #define AO_t size_t /* The test_and_set primitive returns an AO_TS_VAL_t value. */ /* AO_TS_t is the type of an in-memory test-and-set location. */ #define AO_TS_INITIALIZER (AO_t)AO_TS_CLEAR /* Platform-dependent stuff: */ #if defined(__GNUC__) || defined(_MSC_VER) || defined(__INTEL_COMPILER) \ || defined(__DMC__) || defined(__WATCOMC__) # define AO_INLINE static __inline #elif defined(__sun) # define AO_INLINE static inline #else # define AO_INLINE static #endif #if defined(__GNUC__) && !defined(__INTEL_COMPILER) # define AO_compiler_barrier() __asm__ __volatile__("" : : : "memory") #elif defined(_MSC_VER) || defined(__DMC__) || defined(__BORLANDC__) \ || defined(__WATCOMC__) # if defined(_AMD64_) || defined(_M_X64) || _MSC_VER >= 1400 # if defined(_WIN32_WCE) /* # include <cmnintrin.h> */ # elif defined(_MSC_VER) # include <intrin.h> # endif # pragma intrinsic(_ReadWriteBarrier) # define AO_compiler_barrier() _ReadWriteBarrier() /* We assume this does not generate a fence instruction. */ /* The documentation is a bit unclear. */ # else # define AO_compiler_barrier() __asm { } /* The preceding implementation may be preferable here too. */ /* But the documentation warns about VC++ 2003 and earlier. */ # endif #elif defined(__INTEL_COMPILER) # define AO_compiler_barrier() __memory_barrier() /* Too strong? IA64-only? */ #elif defined(_HPUX_SOURCE) # if defined(__ia64) # include <machine/sys/inline.h> # define AO_compiler_barrier() _Asm_sched_fence() # else /* FIXME - We don't know how to do this. This is a guess. */ /* And probably a bad one. */ static volatile int AO_barrier_dummy; # define AO_compiler_barrier() AO_barrier_dummy = AO_barrier_dummy # endif #else /* We conjecture that the following usually gives us the right */ /* semantics or an error.
*/ # define AO_compiler_barrier() asm("") #endif #if defined(AO_USE_PTHREAD_DEFS) # include "atomic_ops/sysdeps/generic_pthread.h" #endif /* AO_USE_PTHREAD_DEFS */ #if (defined(__CC_ARM) || defined(__ARMCC__)) && !defined(__GNUC__) \ && !defined(AO_USE_PTHREAD_DEFS) # include "atomic_ops/sysdeps/armcc/arm_v6.h" # define AO_GENERALIZE_TWICE #endif #if defined(__GNUC__) && !defined(AO_USE_PTHREAD_DEFS) \ && !defined(__INTEL_COMPILER) # if defined(__i386__) /* We don't define AO_USE_SYNC_CAS_BUILTIN for x86 here because */ /* it might require specifying additional options (like -march) */ /* or additional link libraries (if -march is not specified). */ # include "atomic_ops/sysdeps/gcc/x86.h" # endif /* __i386__ */ # if defined(__x86_64__) # if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2) /* It is safe to use __sync CAS built-in on this architecture. */ # define AO_USE_SYNC_CAS_BUILTIN # endif # include "atomic_ops/sysdeps/gcc/x86_64.h" # endif /* __x86_64__ */ # if defined(__ia64__) # include "atomic_ops/sysdeps/gcc/ia64.h" # define AO_GENERALIZE_TWICE # endif /* __ia64__ */ # if defined(__hppa__) # include "atomic_ops/sysdeps/gcc/hppa.h" # define AO_CAN_EMUL_CAS # endif /* __hppa__ */ # if defined(__alpha__) # include "atomic_ops/sysdeps/gcc/alpha.h" # define AO_GENERALIZE_TWICE # endif /* __alpha__ */ # if defined(__s390__) # include "atomic_ops/sysdeps/gcc/s390.h" # endif /* __s390__ */ # if defined(__sparc__) # include "atomic_ops/sysdeps/gcc/sparc.h" # define AO_CAN_EMUL_CAS # endif /* __sparc__ */ # if defined(__m68k__) # include "atomic_ops/sysdeps/gcc/m68k.h" # endif /* __m68k__ */ # if defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \ || defined(__powerpc64__) || defined(__ppc64__) # include "atomic_ops/sysdeps/gcc/powerpc.h" # endif /* __powerpc__ */ # if defined(__arm__) && !defined(AO_USE_PTHREAD_DEFS) # include "atomic_ops/sysdeps/gcc/arm.h" # define AO_CAN_EMUL_CAS # endif /* __arm__ */ # if defined(__cris__) || defined(CRIS) # include "atomic_ops/sysdeps/gcc/cris.h" # define AO_GENERALIZE_TWICE # endif # if defined(__mips__) # include "atomic_ops/sysdeps/gcc/mips.h" # endif /* __mips__ */ # if defined(__sh__) || defined(SH4) # include "atomic_ops/sysdeps/gcc/sh.h" # define AO_CAN_EMUL_CAS # endif /* __sh__ */ # if defined(__avr32__) # include "atomic_ops/sysdeps/gcc/avr32.h" # endif #endif /* __GNUC__ && !AO_USE_PTHREAD_DEFS */ #if (defined(__IBMC__) || defined(__IBMCPP__)) && !defined(__GNUC__) \ && !defined(AO_USE_PTHREAD_DEFS) # if defined(__powerpc__) || defined(__powerpc) || defined(__ppc__) \ || defined(__PPC__) || defined(_M_PPC) || defined(_ARCH_PPC) \ || defined(_ARCH_PWR) # include "atomic_ops/sysdeps/ibmc/powerpc.h" # define AO_GENERALIZE_TWICE # endif #endif #if defined(__INTEL_COMPILER) && !defined(AO_USE_PTHREAD_DEFS) # if defined(__ia64__) # include "atomic_ops/sysdeps/icc/ia64.h" # define AO_GENERALIZE_TWICE # endif # if defined(__GNUC__) /* Intel Compiler in GCC compatible mode */ # if defined(__i386__) # include "atomic_ops/sysdeps/gcc/x86.h" # endif /* __i386__ */ # if defined(__x86_64__) # if __INTEL_COMPILER > 1110 # define AO_USE_SYNC_CAS_BUILTIN # endif # include "atomic_ops/sysdeps/gcc/x86_64.h" # endif /* __x86_64__ */ # endif #endif #if defined(_HPUX_SOURCE) && !defined(__GNUC__) && !defined(AO_USE_PTHREAD_DEFS) # if defined(__ia64) # include "atomic_ops/sysdeps/hpc/ia64.h" # define AO_GENERALIZE_TWICE # else # include "atomic_ops/sysdeps/hpc/hppa.h" # define AO_CAN_EMUL_CAS # endif #endif #if defined(_MSC_VER) || 
defined(__DMC__) || defined(__BORLANDC__) \ || (defined(__WATCOMC__) && defined(__NT__)) # if defined(_AMD64_) || defined(_M_X64) # include "atomic_ops/sysdeps/msftc/x86_64.h" # elif defined(_M_IX86) || defined(x86) # include "atomic_ops/sysdeps/msftc/x86.h" # elif defined(_M_ARM) || defined(ARM) || defined(_ARM_) # include "atomic_ops/sysdeps/msftc/arm.h" # define AO_GENERALIZE_TWICE # endif #endif #if defined(__sun) && !defined(__GNUC__) && !defined(AO_USE_PTHREAD_DEFS) /* Note: use -DAO_USE_PTHREAD_DEFS if Sun CC does not handle inline asm. */ # if defined(__i386) # include "atomic_ops/sysdeps/sunc/x86.h" # endif /* __i386 */ # if defined(__x86_64) || defined(__amd64) # include "atomic_ops/sysdeps/sunc/x86_64.h" # endif /* __x86_64 */ #endif #if !defined(__GNUC__) && (defined(sparc) || defined(__sparc)) \ && !defined(AO_USE_PTHREAD_DEFS) # include "atomic_ops/sysdeps/sunc/sparc.h" # define AO_CAN_EMUL_CAS #endif #if defined(AO_REQUIRE_CAS) && !defined(AO_HAVE_compare_and_swap) \ && !defined(AO_HAVE_compare_and_swap_full) \ && !defined(AO_HAVE_compare_and_swap_acquire) # if defined(AO_CAN_EMUL_CAS) # include "atomic_ops/sysdeps/emul_cas.h" # else # error Cannot implement AO_compare_and_swap_full on this architecture. # endif #endif /* AO_REQUIRE_CAS && !AO_HAVE_compare_and_swap ... */ /* The most common way to clear a test-and-set location */ /* at the end of a critical section. */ #if AO_AO_TS_T && !defined(AO_CLEAR) # define AO_CLEAR(addr) AO_store_release((AO_TS_t *)(addr), AO_TS_CLEAR) #endif #if AO_CHAR_TS_T && !defined(AO_CLEAR) # define AO_CLEAR(addr) AO_char_store_release((AO_TS_t *)(addr), AO_TS_CLEAR) #endif /* The generalization section. */ #if !defined(AO_GENERALIZE_TWICE) && defined(AO_CAN_EMUL_CAS) \ && !defined(AO_HAVE_compare_and_swap_full) # define AO_GENERALIZE_TWICE #endif /* Theoretically we should repeatedly include atomic_ops_generalize.h. */ /* In fact, we observe that this converges after a small fixed number */ /* of iterations, usually one. 
*/ #include "atomic_ops/generalize.h" #ifdef AO_GENERALIZE_TWICE # include "atomic_ops/generalize.h" #endif /* For compatibility with version 0.4 and earlier */ #define AO_TS_T AO_TS_t #define AO_T AO_t #define AO_TS_VAL AO_TS_VAL_t #endif /* ATOMIC_OPS_H */ ================================================ FILE: datastructures/trevor_brown_abtree/common/dcss/dcss_plus.h ================================================ /* * File: dcss_plus.h * Author: Maya Arbel-Raviv * * Created on May 1, 2017, 10:42 AM */ #ifndef DCSS_PLUS_H #define DCSS_PLUS_H #include #include #include #include "descriptors.h" #define dcssptagptr_t uintptr_t #define dcsspptr_t dcsspdesc_t * #define casword_t intptr_t #define DCSSP_STATE_UNDECIDED 0 #define DCSSP_STATE_SUCCEEDED 4 #define DCSSP_STATE_FAILED 8 #define DCSSP_LEFTSHIFT 1 #define DCSSP_IGNORED_RETVAL -1 #define DCSSP_SUCCESS 0 #define DCSSP_FAILED_ADDR1 1 #define DCSSP_FAILED_ADDR2 2 #define MAX_PAYLOAD_PTRS 6 struct dcsspresult_t { int status; casword_t failed_val; }; template class dcsspdesc_t { public: volatile mutables_t mutables; casword_t volatile * volatile addr1; casword_t volatile old1; casword_t volatile * volatile addr2; casword_t volatile old2; casword_t volatile new2; PAYLOAD_T volatile payload1[MAX_PAYLOAD_PTRS+1]; PAYLOAD_T volatile payload2[MAX_PAYLOAD_PTRS+1]; const static int size = sizeof(mutables)+sizeof(addr1)+sizeof(old1)+sizeof(addr2)+sizeof(old2)+sizeof(new2)+sizeof(PAYLOAD_T)*(MAX_PAYLOAD_PTRS+1)+sizeof(PAYLOAD_T)*(MAX_PAYLOAD_PTRS+1); char padding[PREFETCH_SIZE_BYTES+(((64<<10)-size%64)%64)]; // add padding to prevent false sharing } __attribute__ ((aligned(64))); template class dcsspProvider { /** * Data definitions */ private: // descriptor reduction algorithm #define DCSSP_MUTABLES_OFFSET_STATE 0 #define DCSSP_MUTABLES_MASK_STATE 0xf #define DCSSP_MUTABLES_NEW(mutables) \ ((((mutables)&MASK_SEQ)+(1< dcsspDescriptors[LAST_TID+1] __attribute__ ((aligned(64))); char __padding_desc3[PREFETCH_SIZE_BYTES]; public: #ifdef USE_DEBUGCOUNTERS debugCounter * dcsspHelpCounter; #endif const int NUM_PROCESSES; /** * Function declarations */ dcsspProvider(const int numProcesses); ~dcsspProvider(); void initThread(const int tid); void deinitThread(const int tid); void writePtr(casword_t volatile * addr, casword_t val); // use for addresses that might have been modified by DCSSP (ONLY GOOD FOR INITIALIZING, CANNOT DEAL WITH CONCURRENT DCSSP OPERATIONS.) void writeVal(casword_t volatile * addr, casword_t val); // use for addresses that might have been modified by DCSSP (ONLY GOOD FOR INITIALIZING, CANNOT DEAL WITH CONCURRENT DCSSP OPERATIONS.) 
casword_t readPtr(const int tid, casword_t volatile * addr); // use for addresses that might have been modified by DCSSP casword_t readVal(const int tid, casword_t volatile * addr); // use for addresses that might have been modified by DCSSP inline dcsspresult_t dcsspPtr(const int tid, casword_t * addr1, casword_t old1, casword_t * addr2, casword_t old2, casword_t new2, PAYLOAD_T * const payload1, PAYLOAD_T * const payload2); // use when addr2 is a pointer, or another type that does not use its least significant bit inline dcsspresult_t dcsspVal(const int tid, casword_t * addr1, casword_t old1, casword_t * addr2, casword_t old2, casword_t new2, PAYLOAD_T * const payload1, PAYLOAD_T * const payload2); // use when addr2 uses its least significant bit, but does not use its most significant but void discardPayloads(const int tid); void debugPrint(); tagptr_t getDescriptorTagptr(const int otherTid); dcsspptr_t getDescriptorPtr(tagptr_t tagptr); bool getDescriptorSnapshot(tagptr_t tagptr, dcsspptr_t const dest); void helpProcess(const int tid, const int otherTid); private: casword_t dcsspRead(const int tid, casword_t volatile * addr); inline dcsspresult_t dcsspHelp(const int tid, dcssptagptr_t tagptr, dcsspptr_t snapshot, bool helpingOther); void dcsspHelpOther(const int tid, dcssptagptr_t tagptr); }; #endif /* DCSS_PLUS_H */ ================================================ FILE: datastructures/trevor_brown_abtree/common/dcss/dcss_plus_impl.h ================================================ /* * File: dcss_plus_impl.h * Author: Maya Arbel-Raviv * * Created on May 1, 2017, 10:52 AM */ #ifndef DCSS_PLUS_IMPL_H #define DCSS_PLUS_IMPL_H #include "dcss_plus.h" #include #include #include using namespace std; #define BOOL_CAS __sync_bool_compare_and_swap #define VAL_CAS __sync_val_compare_and_swap #define DCSSP_TAGBIT 0x1 static bool isDcssp(casword_t val) { return (val & DCSSP_TAGBIT); } template dcsspresult_t dcsspProvider::dcsspHelp(const int tid, dcssptagptr_t tagptr, dcsspptr_t snapshot, bool helpingOther) { // figure out what the state should be casword_t state = DCSSP_STATE_FAILED; SOFTWARE_BARRIER; casword_t val1 = *(snapshot->addr1); SOFTWARE_BARRIER; //DELAY_UP_TO(1000); if (val1 == snapshot->old1) { // linearize here(?) state = DCSSP_STATE_SUCCEEDED; } // try to cas the state to the appropriate value dcsspptr_t ptr = TAGPTR_UNPACK_PTR(dcsspDescriptors,tagptr); casword_t retval; bool failedBit; MUTABLES_VAL_CAS_FIELD(failedBit, retval, ptr->mutables, snapshot->mutables, DCSSP_STATE_UNDECIDED, state, DCSSP_MUTABLES_MASK_STATE, DCSSP_MUTABLES_OFFSET_STATE); if (failedBit) return {DCSSP_IGNORED_RETVAL,0}; // failed to access the descriptor: we must be helping another process complete its operation, so we will NOT use this return value! // TODO: do we do the announcement here? what will be announced exactly? do we let the user provide a pointer/value to announce as an argument to dcssp? do we need to provide an operation to retrieve the current announcement for a given process? 
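// note: at this point exactly one process has moved the descriptor's state
// from UNDECIDED to SUCCEEDED or FAILED; every helper (including us) then
// attempts the same CAS on addr2 below, so the tagged descriptor pointer is
// replaced by new2 (success) or restored to old2 (failure) exactly once.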
// finish the operation based on the descriptor's state if ((retval == DCSSP_STATE_UNDECIDED && state == DCSSP_STATE_SUCCEEDED) // if we changed the state to succeeded OR || retval == DCSSP_STATE_SUCCEEDED) { // if someone else changed the state to succeeded // if (state == DCSSP_STATE_FAILED) DELAY_UP_TO(1000); assert(helpingOther || ((snapshot->mutables & DCSSP_MUTABLES_MASK_STATE) >> DCSSP_MUTABLES_OFFSET_STATE) == DCSSP_STATE_SUCCEEDED); BOOL_CAS(snapshot->addr2, (casword_t) tagptr, snapshot->new2); return {DCSSP_SUCCESS,0}; } else { // either we or someone else changed the state to failed assert((retval == DCSSP_STATE_UNDECIDED && state == DCSSP_STATE_FAILED) || retval == DCSSP_STATE_FAILED); assert(helpingOther || ((snapshot->mutables & DCSSP_MUTABLES_MASK_STATE) >> DCSSP_MUTABLES_OFFSET_STATE) == DCSSP_STATE_FAILED); BOOL_CAS(snapshot->addr2, (casword_t) tagptr, snapshot->old2); // if (state == DCSSP_STATE_FAILED) DELAY_UP_TO(1000); return {DCSSP_FAILED_ADDR1,val1}; } } template void dcsspProvider::dcsspHelpOther(const int tid, dcssptagptr_t tagptr) { const int otherTid = TAGPTR_UNPACK_TID(tagptr); assert(otherTid >= 0 && otherTid < NUM_PROCESSES); dcsspdesc_t newSnapshot; const int sz = dcsspdesc_t::size; assert((((tagptr & MASK_SEQ) >> OFFSET_SEQ) & 1) == 1); if (DESC_SNAPSHOT(dcsspdesc_t, dcsspDescriptors, &newSnapshot, tagptr, sz)) { dcsspHelp(tid, tagptr, &newSnapshot, true); } else { //TRACE COUTATOMICTID("helpOther unable to get snapshot of "< inline tagptr_t dcsspProvider::getDescriptorTagptr(const int otherTid) { dcsspptr_t ptr = &dcsspDescriptors[otherTid]; tagptr_t tagptr = TAGPTR_NEW(otherTid, ptr->mutables, DCSSP_TAGBIT); if ((UNPACK_SEQ(tagptr) & 1) == 0) { // descriptor is being initialized! essentially, // we can think of there being NO ongoing operation, // so we can imagine we return NULL = no descriptor. 
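// (an even sequence number means DESC_NEW has run but DESC_INITIALIZED has
// not yet bumped it to odd, i.e., the descriptor's fields may still be
// half-written, so no one should try to help with it)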
return (tagptr_t) NULL; } return tagptr; } template inline dcsspptr_t dcsspProvider::getDescriptorPtr(tagptr_t tagptr) { return TAGPTR_UNPACK_PTR(dcsspDescriptors, tagptr); } template inline bool dcsspProvider::getDescriptorSnapshot(tagptr_t tagptr, dcsspptr_t const dest) { if (tagptr == (tagptr_t) NULL) return false; return DESC_SNAPSHOT(dcsspdesc_t, dcsspDescriptors, dest, tagptr, dcsspdesc_t::size); } template inline void dcsspProvider::helpProcess(const int tid, const int otherTid) { tagptr_t tagptr = getDescriptorTagptr(otherTid); if (tagptr != (tagptr_t) NULL) dcsspHelpOther(tid, tagptr); } template void dcsspProvider::discardPayloads(const int tid) { SOFTWARE_BARRIER; dcssptagptr_t tagptr = getDescriptorTagptr(tid); dcsspptr_t ptr = getDescriptorPtr(tagptr); ptr->payload1[0] = NULL; ptr->payload2[0] = NULL; SOFTWARE_BARRIER; } template dcsspresult_t dcsspProvider::dcsspVal(const int tid, casword_t * addr1, casword_t old1, casword_t * addr2, casword_t old2, casword_t new2, PAYLOAD_T * const payload1, PAYLOAD_T * const payload2) { return dcsspPtr(tid, addr1, old1, addr2, old2 << DCSSP_LEFTSHIFT , new2 << DCSSP_LEFTSHIFT, payload1, payload2); } template dcsspresult_t dcsspProvider::dcsspPtr(const int tid, casword_t * addr1, casword_t old1, casword_t * addr2, casword_t old2, casword_t new2, PAYLOAD_T * const payload1, PAYLOAD_T * const payload2) { // create dcssp descriptor dcsspptr_t ptr = DESC_NEW(dcsspDescriptors, DCSSP_MUTABLES_NEW, tid); assert((((dcsspDescriptors[tid].mutables & MASK_SEQ) >> OFFSET_SEQ) & 1) == 0); ptr->addr1 = addr1; ptr->old1 = old1; ptr->addr2 = addr2; ptr->old2 = old2; ptr->new2 = new2; // add payload1 and payload2 to the dcssp descriptor int i; for (i=0;payload1[i];++i) { ptr->payload1[i] = payload1[i]; assert(i < MAX_PAYLOAD_PTRS); } ptr->payload1[i] = NULL; for (i=0;payload2[i];++i) { ptr->payload2[i] = payload2[i]; assert(i < MAX_PAYLOAD_PTRS); } ptr->payload2[i] = NULL; DESC_INITIALIZED(dcsspDescriptors, tid); // create tagptr assert((((dcsspDescriptors[tid].mutables & MASK_SEQ) >> OFFSET_SEQ) & 1) == 1); tagptr_t tagptr = TAGPTR_NEW(tid, ptr->mutables, DCSSP_TAGBIT); // perform the dcssp operation described by our descriptor casword_t r; do { assert(!isDcssp(ptr->old2)); assert(isDcssp(tagptr)); r = VAL_CAS(ptr->addr2, ptr->old2, (casword_t) tagptr); if (isDcssp(r)) { #ifdef USE_DEBUGCOUNTERS this->dcsspHelpCounter->inc(tid); #endif dcsspHelpOther(tid, (dcssptagptr_t) r); } } while (isDcssp(r)); if (r == ptr->old2){ // DELAY_UP_TO(1000); return dcsspHelp(tid, tagptr, ptr, false); // finish our own operation } return {DCSSP_FAILED_ADDR2,r};//DCSSP_FAILED_ADDR2; } template casword_t dcsspProvider::dcsspRead(const int tid, casword_t volatile * addr) { casword_t r; while (1) { r = *addr; if (isDcssp(r)) { #ifdef USE_DEBUGCOUNTERS this->dcsspHelpCounter->inc(tid); #endif dcsspHelpOther(tid, (dcssptagptr_t) r); } else { return r; } } } template dcsspProvider::dcsspProvider(const int numProcesses) : NUM_PROCESSES(numProcesses) { #ifdef USE_DEBUGCOUNTERS dcsspHelpCounter = new debugCounter(NUM_PROCESSES); #endif DESC_INIT_ALL(dcsspDescriptors, DCSSP_MUTABLES_NEW, NUM_PROCESSES); for (int tid=0;tid dcsspProvider::~dcsspProvider() { #ifdef USE_DEBUGCOUNTERS delete dcsspHelpCounter; #endif } template casword_t dcsspProvider::readPtr(const int tid, casword_t volatile * addr) { casword_t r; r = dcsspRead(tid, addr); return r; } template casword_t dcsspProvider::readVal(const int tid, casword_t volatile * addr) { return ((casword_t) readPtr(tid, 
addr))>>DCSSP_LEFTSHIFT; } template void dcsspProvider::writePtr(casword_t volatile * addr, casword_t ptr) { //assert((*addr & DCSSP_TAGBIT) == 0); assert((ptr & DCSSP_TAGBIT) == 0); *addr = ptr; } template void dcsspProvider::writeVal(casword_t volatile * addr, casword_t val) { writePtr(addr, val< void dcsspProvider::initThread(const int tid) {} template void dcsspProvider::deinitThread(const int tid) {} template void dcsspProvider::debugPrint() { #ifdef USE_DEBUGCOUNTERS cout<<"dcssp helping : "<dcsspHelpCounter->getTotal()< #include #include #include "dcss_plus_impl.h" using namespace std; #define NUM_OPS 10000000 #define INCREMENT 1 #define FALSE_SHARING_ULL_FACTOR 24 #define FALSE_SHARING_PAD_BYTES 192 #define COUNTER(tid) (counters[(tid)*FALSE_SHARING_ULL_FACTOR]) int numProcesses = 0; volatile unsigned long long counters[MAX_TID_POW2*FALSE_SHARING_ULL_FACTOR]; volatile char padding[FALSE_SHARING_PAD_BYTES]; volatile unsigned long long faa; volatile bool start; volatile int running; // number of threads that are running dcsspProvider * prov; #ifndef KERNEL #define KERNEL test_kernel1 #endif #ifndef VALIDATE #define VALIDATE validate1 #endif //#define GET_FAA_FOR_TID(tid) ((faa >> ((tid)*(62/numProcesses))) & (numProcesses == 1 ? 0xffffffffffffffffULL : ((1ULL<<(62/numProcesses))-1))) void * test_kernel1(void * arg) { const int tid = *((int *) arg); //const unsigned long long numOps = min(1ULL<<20, 1ULL<<(62/numProcesses)-1); //const unsigned long long increment = 1ULL<<(tid*(62/numProcesses)); prov->initThread(tid); __sync_fetch_and_add(&running, 1); while (!start) { __sync_synchronize(); } //COUTATOMICTID("performing "<readVal(tid,(casword_t*)&(COUNTER(tid))); casword_t newval = (casword_t) oldval+1; if (DCSSP_SUCCESS == prov->dcsspVal(tid, (casword_t *) &faa, (casword_t) faa, (casword_t *) &COUNTER(tid), oldval, newval, deletedNodes)) { ++numSucc; __sync_fetch_and_add(&faa, INCREMENT); } #else ++numSucc; ++COUNTER(tid); __sync_fetch_and_add(&faa, INCREMENT); #endif } prov->deinitThread(tid); } bool validate1() { // compute checksum bool good = true; for (int i=0;ireadVal(i,(casword_t*)&(COUNTER(i))); if (c != NUM_OPS) { cout<<"ERROR: counters["<readVal(tid, (casword_t *) &faa); casword_t new2 = (casword_t) old2+1; if (DCSSP_SUCCESS == prov->dcsspVal(tid, (casword_t *) &COUNTER((tid+1)%numProcesses), old1, (casword_t *) &faa, old2, new2, deletedNodes)) { ++numSucc; ++COUNTER(tid); } } prov->deinitThread(tid); } bool validate2() { // compute checksum bool good = true; for (int i=0;i. * these three fields are defined by the TAGPTR_ macros below. 
*/ #ifndef WIDTH1_SEQ #define WIDTH1_SEQ 48 #endif #define OFFSET1_SEQ 11 #define MASK1_SEQ ((uintptr_t)((1LL<>OFFSET1_SEQ) #define TAGPTR1_OFFSET_STALE 0 /* UNUSED */ #define TAGPTR1_OFFSET_TID 1 #define TAGPTR1_MASK_STALE 0x1 /* UNUSED */ #define TAGPTR1_MASK_TID (((1<>TAGPTR1_OFFSET_TID)) #define TAGPTR1_UNPACK_PTR(tagptr) (&DESC1_ARRAY[TAGPTR1_UNPACK_TID((tagptr))]) #define TAGPTR1_NEW(tid, mutables) ((tagptr_t) (((UNPACK1_SEQ(mutables))<>TAGPTR1_OFFSET_TID) #define TAGPTR1_STATIC_DESC(id) ((tagptr_t) TAGPTR1_NEW(LAST_TID1-1-id, 0)) #define TAGPTR1_DUMMY_DESC(id) ((tagptr_t) TAGPTR1_NEW(LAST_TID1, id<>(offset)) #define MUTABLES1_WRITE_FIELD(fldMutables, snapMutables, val, mask, offset) { \ mutables_t __v = (fldMutables); \ while (UNPACK1_SEQ(__v) == UNPACK1_SEQ((snapMutables)) \ && MUTABLES1_UNPACK_FIELD(__v, (mask), (offset)) != (val) \ && !__sync_bool_compare_and_swap(&(fldMutables), __v, \ (__v & ~(mask)) | ((val)<<(offset)))) { \ __v = (fldMutables); \ } \ } #define MUTABLES1_WRITE_BIT(fldMutables, snapMutables, mask) { \ mutables_t __v = (fldMutables); \ while (UNPACK1_SEQ(__v) == UNPACK1_SEQ((snapMutables)) \ && !(__v&(mask)) \ && !__sync_bool_compare_and_swap(&(fldMutables), __v, (__v|(mask)))) { \ __v = (fldMutables); \ } \ } // WARNING: uses a GCC extension "({ })". to get rid of this, use an inline function. #define DESC1_SNAPSHOT(descDest, tagptr, sz) ({ \ DESC1_T *__src = TAGPTR1_UNPACK_PTR((tagptr)); \ memcpy((descDest), __src, (sz)); \ SOFTWARE_BARRIER; /* prevent compiler from reordering read of __src->mutables before (at least the reading portion of) the memcpy */ \ (UNPACK1_SEQ(__src->c.mutables) == UNPACK1_SEQ((tagptr))); \ }) #define DESC1_READ_FIELD(successBit, fldMutables, tagptr, mask, offset) ({ \ mutables_t __mutables = (fldMutables); \ successBit = (UNPACK1_SEQ(__mutables) == UNPACK1_SEQ(tagptr)); \ MUTABLES1_UNPACK_FIELD(__mutables, (mask), (offset)); \ }) #define DESC1_NEW(tid) &DESC1_ARRAY[(tid)]; { /* note: only the process invoking this following macro can change the sequence# */ \ SOFTWARE_BARRIER; \ uintptr_t __v = DESC1_ARRAY[(tid)].c.mutables; \ /* while (!__sync_bool_compare_and_swap(&DESC1_ARRAY[(tid)].mutables, __v, MUTABLES1_NEW(__v))) { \ __v = DESC1_ARRAY[(tid)].mutables; \ } \ }*/ \ DESC1_ARRAY[(tid)].c.mutables = MUTABLES1_NEW(__v); \ /*__sync_synchronize();*/ \ SOFTWARE_BARRIER; \ } #define DESC1_INITIALIZED(tid) \ SOFTWARE_BARRIER; \ DESC1_ARRAY[(tid)].c.mutables += (1<. * these three fields are defined by the TAGPTR_ macros below. */ #ifndef WIDTH_SEQ #define WIDTH_SEQ 48 #endif #define OFFSET_SEQ 14 #define MASK_SEQ ((uintptr_t)((1LL<>OFFSET_SEQ) #define TAGPTR_OFFSET_USER 0 #define TAGPTR_OFFSET_TID 3 #define TAGPTR_MASK_USER ((1<>TAGPTR_OFFSET_TID)) #define TAGPTR_UNPACK_PTR(descArray, tagptr) (&(descArray)[TAGPTR_UNPACK_TID((tagptr))]) #define TAGPTR_NEW(tid, mutables, userBits) ((tagptr_t) (((UNPACK_SEQ(mutables))<>TAGPTR_OFFSET_TID) #define TAGPTR_STATIC_DESC(id) ((tagptr_t) TAGPTR_NEW(LAST_TID-1-id, 0)) #define TAGPTR_DUMMY_DESC(id) ((tagptr_t) TAGPTR_NEW(LAST_TID, id<>(offset)) // TODO: make more efficient version "MUTABLES_CAS_BIT" // TODO: change sequence # unpacking to masking for quick comparison // note: if there is only one subfield besides seq#, then the third if-block is redundant, and you should just return false if the cas fails, since the only way the cas fails and the field being cas'd contains still old is if the sequence number has changed. 
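// the two CAS macros below atomically update a single sub-field of the packed
// mutables word, re-validating the sequence number against a snapshot on each
// attempt. as a rough stand-alone model of the field-packing arithmetic they
// rely on (an illustrative sketch only, with hypothetical helper names; not
// part of the original header):
//
//     #include <stdint.h>
//     static inline uint64_t field_get(uint64_t w, uint64_t mask, int off) {
//         return (w & mask) >> off;          // read one packed sub-field
//     }
//     static inline uint64_t field_set(uint64_t w, uint64_t v,
//                                      uint64_t mask, int off) {
//         return (w & ~mask) | (v << off);   // overwrite one packed sub-field
//     }
//     // one-shot CAS of a sub-field from oldv to newv; the real macros wrap
//     // this in a retry loop that distinguishes "sequence number changed"
//     // from "sub-field contents changed":
//     static inline int field_cas(volatile uint64_t *w, uint64_t snap,
//                                 uint64_t oldv, uint64_t newv,
//                                 uint64_t mask, int off) {
//         return __sync_bool_compare_and_swap(w,
//                 field_set(snap, oldv, mask, off),
//                 field_set(snap, newv, mask, off));
//     }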
#define MUTABLES_BOOL_CAS_FIELD(successBit, fldMutables, snapMutables, oldval, val, mask, offset) { \ mutables_t __v = (fldMutables); \ while (1) { \ if (UNPACK_SEQ(__v) != UNPACK_SEQ((snapMutables))) { \ (successBit) = false; \ break; \ } \ if ((successBit) = __sync_bool_compare_and_swap(&(fldMutables), \ (__v & ~(mask)) | ((oldval)<<(offset)), \ (__v & ~(mask)) | ((val)<<(offset)))) { \ break; \ } \ __v = (fldMutables); \ if (MUTABLES_UNPACK_FIELD(__v, (mask), (offset)) != (oldval)) { \ (successBit) = false; \ break; \ } \ } \ } #define MUTABLES_VAL_CAS_FIELD(failedBit, retval, fldMutables, snapMutables, oldval, val, mask, offset) { \ mutables_t __v = (fldMutables); \ while (1) { \ if (UNPACK_SEQ(__v) != UNPACK_SEQ((snapMutables))) { \ (failedBit) = true; /* version number has changed, CAS cannot occur */ \ break; \ } \ mutables_t __oldval = (__v & ~(mask)) | ((oldval)<<(offset)); \ (retval) = __sync_val_compare_and_swap(&(fldMutables), \ __oldval, \ (__v & ~(mask)) | ((val)<<(offset))); \ if ((retval) == __oldval) { /* CAS SUCCESS */ \ (retval) = MUTABLES_UNPACK_FIELD((retval), (mask), (offset)); /* return contents of subfield */ \ (failedBit) = false; \ break; \ } else { /* CAS FAILURE: should we retry? */ \ __v = (retval); /* save the value that caused our CAS to fail, in case we need to retry */ \ (retval) = MUTABLES_UNPACK_FIELD((retval), (mask), (offset)); /* return contents of subfield */ \ if ((retval) != (oldval)) { /* check if we failed because the subfield's contents do not match oldval */ \ (failedBit) = false; \ break; \ } \ /* subfield's contents DO match oldval, so we need to try again */ \ } \ } \ } // TODO: change sequence # unpacking to masking for quick comparison // note: MUTABLES_FAA_FIELD would be very similar to MUTABLES_BOOL_CAS_FIELD; i think one would simply delete the last if block and change the new val from (val)<mutables before (at least the reading portion of) the memcpy */ \ (UNPACK_SEQ(__src->mutables) == UNPACK_SEQ((tagptr))); \ }) #define DESC_READ_FIELD(successBit, fldMutables, tagptr, mask, offset) ({ \ mutables_t __mutables = (fldMutables); \ successBit = (__mutables & MASK_SEQ) == ((tagptr) & MASK_SEQ); \ MUTABLES_UNPACK_FIELD(__mutables, (mask), (offset)); \ }) #define DESC_NEW(descArray, macro_mutablesNew, tid) &(descArray)[(tid)]; { /* note: only the process invoking this following macro can change the sequence# */ \ SOFTWARE_BARRIER; \ mutables_t __v = (descArray)[(tid)].mutables; \ (descArray)[(tid)].mutables = macro_mutablesNew(__v); \ SOFTWARE_BARRIER; \ /*__sync_synchronize();*/ \ } #define DESC_INITIALIZED(descArray, tid) \ SOFTWARE_BARRIER; \ (descArray)[(tid)].mutables += (1< #include #include #ifndef error #define error(s) { \ std::cout<<"ERROR: "< #include #include #include using namespace std; template class allocator_bump : public allocator_interface { private: const int cachelines; // # cachelines needed to store an object of type T // for bump allocation from a contiguous chunk of memory T ** mem; // mem[tid*PREFETCH_SIZE_WORDS] = pointer to current array to perform bump allocation from int * memBytes; // memBytes[tid*PREFETCH_SIZE_WORDS] = size of mem in bytes T ** current; // current[tid*PREFETCH_SIZE_WORDS] = pointer to current position in array mem vector ** toFree; // toFree[tid] = pointer to vector of bump allocation arrays to free when this allocator is destroyed T* bump_memory_next(const int tid) { T* result = current[tid*PREFETCH_SIZE_WORDS]; current[tid*PREFETCH_SIZE_WORDS] = (T*) (((char*) 
current[tid*PREFETCH_SIZE_WORDS]) + (cachelines*BYTES_IN_CACHE_LINE)); return result; } int bump_memory_bytes_remaining(const int tid) { return (((char*) mem[tid*PREFETCH_SIZE_WORDS])+memBytes[tid*PREFETCH_SIZE_WORDS]) - ((char*) current[tid*PREFETCH_SIZE_WORDS]); } bool bump_memory_full(const int tid) { return (((char*) current[tid*PREFETCH_SIZE_WORDS])+cachelines*BYTES_IN_CACHE_LINE > ((char*) mem[tid*PREFETCH_SIZE_WORDS])+memBytes[tid*PREFETCH_SIZE_WORDS]); } // call this when mem is null, or doesn't contain enough space to allocate an object void bump_memory_allocate(const int tid) { mem[tid*PREFETCH_SIZE_WORDS] = (T*) malloc(1<<24); memBytes[tid*PREFETCH_SIZE_WORDS] = 1<<24; current[tid*PREFETCH_SIZE_WORDS] = mem[tid*PREFETCH_SIZE_WORDS]; toFree[tid]->push_back(mem[tid*PREFETCH_SIZE_WORDS]); // remember we allocated this to free it later #ifdef HAS_FUNCTION_aligned_alloc #else // align on cacheline boundary int mod = (int) (((long) mem[tid*PREFETCH_SIZE_WORDS]) % BYTES_IN_CACHE_LINE); if (mod > 0) { // we are ignoring the first mod bytes of mem, because if we // use them, we will not be aligning objects to cache lines. current[tid*PREFETCH_SIZE_WORDS] = (T*) (((char*) mem[tid*PREFETCH_SIZE_WORDS]) + BYTES_IN_CACHE_LINE - mod); } else { current[tid*PREFETCH_SIZE_WORDS] = mem[tid*PREFETCH_SIZE_WORDS]; } #endif assert((((long) current[tid*PREFETCH_SIZE_WORDS]) % BYTES_IN_CACHE_LINE) == 0); } public: template struct rebind { typedef allocator_bump<_Tp1> other; }; // reserve space for ONE object of type T T* allocate(const int tid) { // bump-allocate from a contiguous chunk of memory if (!mem[tid*PREFETCH_SIZE_WORDS] || bump_memory_full(tid)) { bump_memory_allocate(tid); MEMORY_STATS { this->debug->addAllocated(tid, memBytes[tid*PREFETCH_SIZE_WORDS] / cachelines / BYTES_IN_CACHE_LINE); VERBOSE DEBUG2 { // if ((this->debug->getAllocated(tid) % 2000) == 0) { // this->debugInterfaces->reclaim->debugPrintStatus(tid); // debugPrintStatus(tid); COUTATOMICTID("allocated "<<(memBytes[tid*PREFETCH_SIZE_WORDS] / cachelines / BYTES_IN_CACHE_LINE)/*this->debug->getAllocated(tid)*/<<" records of size "<pool->debugPrintStatus(tid); // COUTATOMIC(endl); // } } } } return bump_memory_next(tid); } void static deallocate(const int tid, T * const p) { // no op for this allocator; memory is freed only by the destructor. // however, we have to call the destructor for the object manually... p->~T(); } void deallocateAndClear(const int tid, blockbag * const bag) { // the bag is cleared, which makes it seem like we're leaking memory, // but it will be freed in the destructor as we release the huge // slabs of memory. 
bag->clearWithoutFreeingElements(); } void debugPrintStatus(const int tid) {} void initThread(const int tid) {} allocator_bump(const int numProcesses, debugInfo * const _debug) : allocator_interface(numProcesses, _debug) , cachelines((sizeof(T)+(BYTES_IN_CACHE_LINE-1))/BYTES_IN_CACHE_LINE){ VERBOSE DEBUG COUTATOMIC("constructor allocator_bump"<*[numProcesses]; for (int tid=0;tid(); } } ~allocator_bump() { VERBOSE COUTATOMIC("destructor allocator_bump"<NUM_PROCESSES;++tid) { int n = toFree[tid]->size(); for (int i=0;i using namespace std; template class allocator_interface { public: debugInfo * const debug; const int NUM_PROCESSES; template struct rebind { typedef allocator_interface<_Tp1> other; }; // allocate space for one object of type T T* allocate(const int tid); void deallocate(const int tid, T * const p); void deallocateAndClear(const int tid, blockbag * const bag); void initThread(const int tid); void debugPrintStatus(const int tid); allocator_interface(const int numProcesses, debugInfo * const _debug) : debug(_debug) , NUM_PROCESSES(numProcesses){ VERBOSE DEBUG std::cout<<"constructor allocator_interface"< #include #include using namespace std; //__thread long long currentAllocatedBytes = 0; //__thread long long maxAllocatedBytes = 0; template class allocator_new : public allocator_interface { public: template struct rebind { typedef allocator_new<_Tp1> other; }; // reserve space for ONE object of type T T* allocate(const int tid) { // allocate a new object MEMORY_STATS { this->debug->addAllocated(tid, 1); VERBOSE { if ((this->debug->getAllocated(tid) % 2000) == 0) { debugPrintStatus(tid); } } // currentAllocatedBytes += sizeof(T); // if (currentAllocatedBytes > maxAllocatedBytes) { // maxAllocatedBytes = currentAllocatedBytes; // } } return new T; //(T*) malloc(sizeof(T)); } void deallocate(const int tid, T * const p) { // note: allocators perform the actual freeing/deleting, since // only they know how memory was allocated. // pools simply call deallocate() to request that it is freed. // allocators do not invoke pool functions. 
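// note: with NO_FREE defined, the delete below is compiled out and this
// allocator deliberately leaks everything, so deallocate() only updates
// the debug counters.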
MEMORY_STATS { this->debug->addDeallocated(tid, 1); // currentAllocatedBytes -= sizeof(T); } #if !defined NO_FREE delete p; #endif } void deallocateAndClear(const int tid, blockbag * const bag) { #ifdef NO_FREE bag->clearWithoutFreeingElements(); #else while (!bag->isEmpty()) { T* ptr = bag->remove(); deallocate(tid, ptr); } #endif } void debugPrintStatus(const int tid) { // std::cout<debug->getAllocated(tid)<<" objects of size "<<(sizeof(T)); // std::cout<<" "; //// this->pool->debugPrintStatus(tid); // std::cout<(numProcesses, _debug) { VERBOSE DEBUG std::cout<<"constructor allocator_new"< #include #include #include #include using namespace std; //__thread long long currentAllocatedBytes = 0; //__thread long long maxAllocatedBytes = 0; template class allocator_new_segregated : public allocator_interface { private: void* (*allocfn)(size_t size); void (*freefn)(void *ptr); public: template struct rebind { typedef allocator_new_segregated<_Tp1> other; }; // reserve space for ONE object of type T T* allocate(const int tid) { // allocate a new object MEMORY_STATS { this->debug->addAllocated(tid, 1); VERBOSE { if ((this->debug->getAllocated(tid) % 2000) == 0) { debugPrintStatus(tid); } } // currentAllocatedBytes += sizeof(T); // if (currentAllocatedBytes > maxAllocatedBytes) { // maxAllocatedBytes = currentAllocatedBytes; // } } return (T*) allocfn(sizeof(T)); } void deallocate(const int tid, T * const p) { // note: allocators perform the actual freeing/deleting, since // only they know how memory was allocated. // pools simply call deallocate() to request that it is freed. // allocators do not invoke pool functions. MEMORY_STATS { this->debug->addDeallocated(tid, 1); // currentAllocatedBytes -= sizeof(T); } #if !defined NO_FREE p->~T(); // explicitly call destructor, since we lose automatic destructor calls when we bypass new/delete([]) freefn(p); #endif } void deallocateAndClear(const int tid, blockbag * const bag) { #if defined NO_FREE bag->clearWithoutFreeingElements(); #else while (!bag->isEmpty()) { T* ptr = bag->remove(); deallocate(tid, ptr); } #endif } void debugPrintStatus(const int tid) {} void initThread(const int tid) {} static void* dummy_thr(void *p) { return 0; } allocator_new_segregated(const int numProcesses, debugInfo * const _debug) : allocator_interface(numProcesses, _debug) { VERBOSE DEBUG std::cout<<"constructor allocator_new_segregated"< #include #include #include using namespace std; // this allocator only performs allocation once, at the beginning of the program. // define the following to specify how much memory should be allocated. #ifndef ALLOC_ONCE_MEMORY #define ALLOC_ONCE_MEMORY (1ULL<<32) /* default: 4 GB */ #endif #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) template class allocator_once : public allocator_interface { private: const int cachelines; // # cachelines needed to store an object of type T // for bump allocation from a contiguous chunk of memory T ** mem; // mem[tid] = pointer to current array to perform bump allocation from size_t * memBytes; // memBytes[tid*PREFETCH_SIZE_WORDS] = size of mem in bytes T ** current; // current[tid*PREFETCH_SIZE_WORDS] = pointer to current position in array mem T* bump_memory_next(const int tid) { T* result = current[tid*PREFETCH_SIZE_WORDS]; current[tid*PREFETCH_SIZE_WORDS] = (T*) (((char*) current[tid*PREFETCH_SIZE_WORDS]) + (cachelines*BYTES_IN_CACHE_LINE)); return result; } int bump_memory_bytes_remaining(const int tid) { return (((char*) mem[tid])+memBytes[tid*PREFETCH_SIZE_WORDS]) - ((char*) current[tid*PREFETCH_SIZE_WORDS]); } bool bump_memory_full(const int tid) { return (((char*) current[tid*PREFETCH_SIZE_WORDS])+cachelines*BYTES_IN_CACHE_LINE > ((char*) mem[tid])+memBytes[tid*PREFETCH_SIZE_WORDS]); } public: template struct rebind { typedef allocator_once<_Tp1> other; }; // reserve space for ONE object of type T T* allocate(const int tid) { if (bump_memory_full(tid)) return NULL; return bump_memory_next(tid); } void static deallocate(const int tid, T * const p) { // no op for this allocator; memory is freed only by the destructor. // however, we have to call the destructor for the object manually... p->~T(); } void deallocateAndClear(const int tid, blockbag * const bag) { // the bag is cleared, which makes it seem like we're leaking memory, // but it will be freed in the destructor as we release the huge // slabs of memory. bag->clearWithoutFreeingElements(); } void debugPrintStatus(const int tid) {} void initThread(const int tid) { // // touch each page of memory before our trial starts // long pagesize = sysconf(_SC_PAGE_SIZE); // int last = (int) (memBytes[tid*PREFETCH_SIZE_WORDS]/pagesize); // VERBOSE COUTATOMICTID("touching each page... memBytes="<(numProcesses, _debug) , cachelines((sizeof(T)+(BYTES_IN_CACHE_LINE-1))/BYTES_IN_CACHE_LINE) { VERBOSE DEBUG COUTATOMIC("constructor allocator_once"< 0) { // we are ignoring the first mod bytes of mem, because if we // use them, we will not be aligning objects to cache lines. current[tid*PREFETCH_SIZE_WORDS] = (T*) (((char*) mem[tid]) + BYTES_IN_CACHE_LINE - mod); } else { current[tid*PREFETCH_SIZE_WORDS] = mem[tid]; } assert((((long) current[tid*PREFETCH_SIZE_WORDS]) % BYTES_IN_CACHE_LINE) == 0); } } ~allocator_once() { long allocated = 0; for (int tid=0;tidNUM_PROCESSES;++tid) { allocated += (((char*) current[tid*PREFETCH_SIZE_WORDS]) - ((char*) mem[tid])); } VERBOSE COUTATOMIC("destructor allocator_once allocated="<NUM_PROCESSES;++tid) { delete mem[tid]; } delete[] mem; delete[] memBytes; delete[] current; } }; #endif /* ALLOC_ONCE_H */ ================================================ FILE: datastructures/trevor_brown_abtree/common/recordmgr/arraylist.h ================================================ /** * C++ record manager implementation (PODC 2015) by Trevor Brown. * * Copyright (C) 2015 Trevor Brown * */ #ifndef ARRAYLIST_H #define ARRAYLIST_H #include #include #include #include "plaf.h" #include "globals.h" using namespace std; // this list allows multiple readers, but only ONE writer. 
// i don't know if it is linearizable; maybe linearize at __size.load()/store() template class AtomicArrayList { private: atomic_int __size; atomic_uintptr_t *data; public: const int capacity; AtomicArrayList(const int _capacity) : capacity(_capacity) { VERBOSE DEBUG COUTATOMIC("constructor AtomicArrayList capacity="<= 0 && ix < sz); if (ix != sz-1) data[ix].store(data[sz-1].load(memory_order_relaxed), memory_order_relaxed); __size.store(sz-1, memory_order_relaxed); // note: this must be seq_cst if membars are not manually added } inline void erase(T * const obj) { int ix = getIndex(obj); if (ix != -1) erase(ix); } inline int getIndex(T * const obj) { int sz = __size.load(memory_order_relaxed); // note: this must be seq_cst if membars are not manually added for (int i=0;i class ArrayList { private: int __size; T **data; public: const int capacity; ArrayList(const int _capacity) : capacity(_capacity) { __size = 0; data = new T*[capacity]; } ~ArrayList() { delete[] data; } inline T* get(const int ix) { return data[ix]; } inline int size() { return __size; } inline void add(T * const obj) { assert(__size < capacity); data[__size++] = obj; } inline void erase(const int ix) { assert(ix >= 0 && ix < __size); data[ix] = data[--__size]; } inline void erase(T * const obj) { int ix = getIndex(obj); if (ix != -1) erase(ix); } inline int getIndex(T * const obj) { for (int i=0;i<__size;++i) { if (data[i] == obj) return i; } return -1; } inline bool contains(T * const obj) { return (getIndex(obj) != -1); } inline void clear() { __size = 0; } inline bool isFull() { return __size == capacity; } inline bool isEmpty() { return __size == 0; } }; #endif /* ARRAYLIST_H */ ================================================ FILE: datastructures/trevor_brown_abtree/common/recordmgr/blockbag.h ================================================ /** * C++ record manager implementation (PODC 2015) by Trevor Brown. * * Copyright (C) 2015 Trevor Brown * */ #ifndef BLOCKLIST_H #define BLOCKLIST_H #include #include #include "blockpool.h" #include "plaf.h" using namespace std; template class blockpool; template class blockbag; template class block; #include "lockfreeblockbag.h" // BLOCK_SIZE must be a power of two, or else the bitwise math is invalid. 
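// (for a power of two B, x & (B-1) equals x % B; e.g., with B = 256,
// 300 & 255 == 44 == 300 % 256. this identity is the bitwise math in question)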
#define BLOCK_SIZE (1<<8) template class block { // stack implemented as an array private: T * data[BLOCK_SIZE]; int size; public: block *next; block(block * const _next) : next(_next) { size = 0; } ~block() { assert(size == 0); } bool isFull() { return size == BLOCK_SIZE; } bool isEmpty() { return size == 0; } // precondition: !isFull() void push(T * const obj) { assert(size < BLOCK_SIZE); const int sz = size; //assert(interruptible[((long) ((int *) pthread_getspecific(pthreadkey)))*PREFETCH_SIZE_WORDS] == false); data[size] = obj; SOFTWARE_BARRIER; size = sz+1; } // precondition: !isEmpty() T* pop() { assert(size > 0); const int sz = size-1; size = sz; return data[sz]; } T* peek(const int ix) { assert(ix >= 0); //assert(ix < size); return data[ix]; } // warning: linear time bool contains(T* const obj) { for (int i=0;i 0); if (data[size-1] == obj) { --size; // erase last pushed item return; } // the things we want to remove are probably the oldest, // so we iterate forward (head of stack = data[size-1]) for (int i=0;i 0); if (ix != size-1) { data[ix] = data[size-1]; } SOFTWARE_BARRIER; --size; // erase last item } } void replace(const int ix, T* const obj) { assert(ix >= 0); assert(ix < size); assert(obj); data[ix] = obj; } int computeSize() { return size; } // this function is occasionally useful if, for instance, // you use a bump allocator, which hands out objects from // a huge slab of memory. // then, in the destructor for a data structure, we can clear // a block without worrying about leaking memory, // since we will just free the whole slab at once. void clearWithoutFreeingElements() { SOFTWARE_BARRIER; size = 0; SOFTWARE_BARRIER; } }; template class blockbag_iterator { private: blockbag * const bag; block * const head; block * curr; int ix; // long long reclaimCountStart; #ifdef BLOCKBAG_ITERATOR_COUNT_BLOCKS_TRAVERSED int sizeInBlocks; int blocksTraversed; #endif #ifdef BLOCKBAG_ITERATOR_COUNT_STEPS int steps; #endif public: block *getCurr() const { return curr; } int getIndex() const { return ix; } blockbag_iterator(block * const _head, blockbag * const _bag) : bag(_bag), head(_head) { #ifdef BLOCKBAG_ITERATOR_COUNT_STEPS steps = 0; #endif // reclaimCountStart = bag->getReclaimCount(); // assert((reclaimCountStart % 1) == 0); // if (reclaimCountStart % 1) { // // bag is currently being reclaimed. we cannot traverse it. // curr = NULL; // ix = -1; // } else { curr = head; ix = -1; if (curr) { ix = curr->computeSize(); // "linearize" here (*this)++; } // } #ifdef BLOCKBAG_ITERATOR_COUNT_BLOCKS_TRAVERSED sizeInBlocks = bag->sizeInBlocks; blocksTraversed = 0; #endif } inline T* operator*() const { #ifdef BLOCKBAG_ITERATOR_COUNT_STEPS if (ix < 0) std::cout<<"bag="<next; // race condition: if reclamation happens AND curr is freed along with too many other blocks to fit in the blockpool, then this access might fault // /****** start consistency check for concurrent iteration ******/ // assert(reclaimCountStart == bag->getReclaimCount()); // if (reclaimCountStart != bag->getReclaimCount()) { // // bag is being/has been reclaimed, so we cannot iterate // curr = NULL; // ix = -1; // return *this; // } // /******* end consistency check for concurrent iteration *******/ ix = (curr ? 
curr->computeSize()-1 : -1); // race condition: if reclamation happens AND curr is freed along with too many other blocks to fit in the blockpool, then this access might fault // /****** start consistency check for concurrent iteration ******/ // assert(reclaimCountStart == bag->getReclaimCount()); // if (reclaimCountStart != bag->getReclaimCount()) { // // bag is being/has been reclaimed, so we cannot iterate // curr = NULL; // ix = -1; // return *this; // } // /******* end consistency check for concurrent iteration *******/ } return *this; } void swap(block * const otherCurr, const int otherIx) { T * const temp = otherCurr->peek(otherIx); otherCurr->replace(otherIx, curr->peek(ix)); curr->replace(ix, temp); } // erases the current item void erase() { assert(curr); assert(!curr->isEmpty()); bool result = bag->erase(curr, ix); if (ix >= curr->computeSize()) { (*this)++; } if (result) { (*this)++; } } }; template inline bool operator==(const blockbag_iterator& a, const blockbag_iterator& b) { if (a.getCurr() != b.getCurr()) return false; if (a.getIndex() != b.getIndex()) return false; return true; } template inline bool operator!=(const blockbag_iterator& a, const blockbag_iterator& b) { return !(a == b); } // bag implemented with linked list whose nodes are blocks. // invariant: head and tail are never NULL // invariant: head is not full (computeSize() < BLOCK_SIZE) // invariant: all blocks except for the head are full // invariant: the bag is empty iff head is empty and head->next is null template class blockbag { private: int owner; volatile long long reclaimCount; // number of times this bag has been the oldest epoch bag and had its nodes reclaimed long debugFreed; public: int sizeInBlocks; private: block *head; block *tail; void validate() { // invariant: head and tail are never NULL assert(head); // invariant: head and tail are never NULL assert(tail); // invariant: head is not full (computeSize() < BLOCK_SIZE) assert(!head->isFull()); // invariant: all blocks except for the head are full block *curr = head->next; while (curr) { assert(curr->isFull()); curr = curr->next; } // invariant: sizeInBlocks is correct assert(sizeInBlocks == computeSizeInBlocks()); } blockpool * const pool; void debugPrintBag() { std::cout<<"("< * curr = head; while (curr) { std::cout<<" "<computeSize()<<"["<<((long)curr)<<"]"; curr = curr->next; } } int computeSizeInBlocks() { int result = 0; block *curr = head; while (curr) { ++result; curr = curr->next; } return result; } public: blockbag(const int tid, blockpool * const _pool) : pool(_pool) { // VERBOSE DEBUG std::cout<<"constructor blockbag"< * const temp = head; head = head->next; //DEBUG ++debugFreed; pool->deallocateBlock(temp); } // VERBOSE DEBUG std::cout<<" freed "< begin() { return blockbag_iterator(head, this); } blockbag_iterator end() { return blockbag_iterator(NULL, this); } void add(T * const obj) { DEBUG2 validate(); int oldsize; DEBUG2 oldsize = computeSize(); head->push(obj); if (head->isFull()) { int oldNumBlocks; DEBUG2 oldNumBlocks = computeSizeInBlocks(); block *newblock = pool->allocateBlock(head); ++sizeInBlocks; //DEBUG2 std::cout<<"((("<<((long)head)<<" full. 
prepending "<<((long)newblock)<<")))"; SOFTWARE_BARRIER; head = newblock; DEBUG2 assert(oldNumBlocks + 1 == computeSizeInBlocks()); DEBUG2 assert(sizeInBlocks == computeSizeInBlocks()); } DEBUG2 assert(oldsize + 1 == computeSize()); DEBUG2 validate(); } template void add(const int tid, T * const obj, lockfreeblockbag * const sharedBag, const int thresh, Alloc * const alloc) { DEBUG2 validate(); int oldsize; DEBUG2 oldsize = computeSize(); head->push(obj); if (head->isFull()) { int oldNumBlocks; DEBUG2 oldNumBlocks = computeSizeInBlocks(); block *newblock = pool->allocateBlock(head); ++sizeInBlocks; //DEBUG2 std::cout<<"((("<<((long)head)<<" full. prepending "<<((long)newblock)<<")))"; head = newblock; DEBUG2 assert(oldNumBlocks + 1 == computeSizeInBlocks()); DEBUG2 assert(sizeInBlocks == computeSizeInBlocks()); DEBUG2 assert(oldsize + 1 == computeSize()); if (sizeInBlocks > thresh) { block *b = removeFullBlock(); // returns NULL if freeBag has < 2 full blocks assert(b); sharedBag->addBlock(b); MEMORY_STATS alloc->debug->addGiven(tid, 1); //DEBUG2 COUTATOMIC(" thread "<tid<<" sharedBag("<<(sizeof(T)==sizeof(Node)?"Node":"SCXRecord")<<") now contains "<size()<<" blocks"<next == NULL && head->isEmpty(); } // precondition: !isEmpty, !curr->isEmpty() // returns true if a subsequent invocation of curr->peek(ix) will return // an item that was previously EARLIER in iterator order, and false otherwise. bool erase(block * const curr, const int ix) { assert(!isEmpty()); assert(!curr->isEmpty()); DEBUG2 validate(); if (head->isEmpty()) { // current block cannot be head, since head is empty assert(curr != head); // eliminate empty head block, since next block will now be non-full block * const temp = head; head = head->next; pool->deallocateBlock(temp); --sizeInBlocks; } assert(!head->isEmpty()); // case 1: curr is the new head if (curr == head) { // erase from head block head->erase(ix); DEBUG2 validate(); return false; // case 2: curr is not the head } else { assert(!head->isEmpty()); // we use head->pop() to retrieve // some object from the head block. // then, we replace the object to be erased // with the object taken from the head block. 
T* obj = head->pop(); curr->replace(ix, obj); DEBUG2 validate(); return true; } } // precondition: !isEmpty() T* remove() { assert(!isEmpty()); DEBUG2 validate(); int oldsize; DEBUG2 oldsize = computeSize(); T *result; if (head->isEmpty()) { result = head->next->pop(); int oldNumBlocks; DEBUG2 oldNumBlocks = computeSizeInBlocks(); block * const temp = head; head = head->next; pool->deallocateBlock(temp); --sizeInBlocks; DEBUG2 assert(oldNumBlocks - 1 == computeSizeInBlocks()); DEBUG2 assert(sizeInBlocks == computeSizeInBlocks()); DEBUG2 assert(oldsize - 1 == computeSize()); DEBUG2 validate(); return result; } else { result = head->pop(); DEBUG2 validate(); return result; } } ////////// not anymore // precondition: !isEmpty() template T* remove(const int tid, lockfreeblockbag * const sharedBag, Alloc * const alloc) { //assert(!isEmpty()); DEBUG2 validate(); int oldsize; DEBUG2 oldsize = computeSize(); T *result; if (head->isEmpty()) { if (head->next) { result = head->next->pop(); int oldNumBlocks; DEBUG2 oldNumBlocks = computeSizeInBlocks(); block * const temp = head; head = head->next; pool->deallocateBlock(temp); --sizeInBlocks; DEBUG2 assert(oldNumBlocks - 1 == computeSizeInBlocks()); DEBUG2 assert(sizeInBlocks == computeSizeInBlocks()); DEBUG2 assert(oldsize - 1 == computeSize()); // if (sizeInBlocks == 1) { // block *b = sharedBag->getBlock(); // if (b) { // addFullBlock(b); // //DEBUG this->debug->addTaken(tid, 1); // //DEBUG2 COUTATOMIC(" thread "<tid<<" took "<computeSize()<<" objects from sharedBag"<allocate(tid)); // } // /** end debug **/ // } // } // assert(sizeInBlocks > 1); DEBUG2 validate(); // MEMORY_STATS2 alloc->debug->addFromPool(tid, 1); return result; } else { block *b = sharedBag->getBlock(); if (b) { addFullBlock(b); MEMORY_STATS alloc->debug->addTaken(tid, 1); //DEBUG2 COUTATOMIC(" thread "<tid<<" took "<computeSize()<<" objects from sharedBag"<allocate(tid); /** begin debug **/ // allocate entire block worth of objects for (int i=0;iallocate(tid)); } /** end debug **/ assert(sizeInBlocks > 1); DEBUG2 validate(); return remove(/*tid, sharedBag, alloc*/); } } } else { // MEMORY_STATS2 alloc->debug->addFromPool(tid, 1); result = head->pop(); DEBUG2 validate(); return result; } } // removes and returns a full block if the list contains // at least two full blocks. 
otherwise, this returns NULL; block* removeFullBlock() { DEBUG2 validate(); int oldsize; DEBUG2 oldsize = computeSize(); int oldNumBlocks; DEBUG2 oldNumBlocks = computeSizeInBlocks(); block *second = head->next; if (second != NULL) { if (second->next != NULL) { assert(second->computeSize() == BLOCK_SIZE); head->next = second->next; second->next = NULL; // not technically necessary, but safer --sizeInBlocks; DEBUG2 assert(oldNumBlocks - 1 == computeSizeInBlocks()); DEBUG2 assert(oldsize - BLOCK_SIZE == computeSize()); DEBUG2 assert(sizeInBlocks == computeSizeInBlocks()); DEBUG2 validate(); return second; } } DEBUG2 assert(oldsize == computeSize()); DEBUG2 if (sizeInBlocks != computeSizeInBlocks()) { std::cout<<"sizeInBlocks="< using namespace std; #define MAX_BLOCK_POOL_SIZE 32 #ifndef VERBOSE #define VERBOSE if(0) #endif template class block; template class blockpool { private: block *pool[MAX_BLOCK_POOL_SIZE]; int poolSize; long debugAllocated; long debugPoolDeallocated; long debugPoolAllocated; long debugFreed; public: blockpool() { poolSize = 0; debugAllocated = 0; debugPoolAllocated = 0; debugPoolDeallocated = 0; debugFreed = 0; } ~blockpool() { VERBOSE DEBUG std::cout<<"destructor blockpool;"; for (int i=0;iisEmpty()); delete pool[i]; // warning: uses locks (for some allocators) } VERBOSE DEBUG std::cout<<" blocks allocated "<* allocateBlock(block * const next) { if (poolSize) { //DEBUG ++debugPoolAllocated; block *result = pool[--poolSize]; // pop a block off the stack *result = block(next); assert(result->next == next); assert(result->computeSize() == 0); assert(result->isEmpty()); return result; } else { //DEBUG ++debugAllocated; return new block(next); // warning: uses locks (for some allocators) } } void deallocateBlock(block * const b) { assert(b->isEmpty()); if (poolSize == MAX_BLOCK_POOL_SIZE) { //DEBUG ++debugFreed; // assert(poolSize < MAX_BLOCK_POOL_SIZE); // for the RQ benchmarks, we want to assert that we never free a block #ifndef NO_FREE delete b; // warning: uses locks (for some allocators) #endif } else { //DEBUG ++debugPoolDeallocated; pool[poolSize++] = b; } } }; #endif /* BLOCKPOOL_H */ ================================================ FILE: datastructures/trevor_brown_abtree/common/recordmgr/debug_info.h ================================================ /** * C++ record manager implementation (PODC 2015) by Trevor Brown. 
* * Copyright (C) 2015 Trevor Brown * */ #ifndef DEBUG_INFO_H #define DEBUG_INFO_H #include "plaf.h" struct _memrecl_counters { volatile char padding1[PREFETCH_SIZE_BYTES]; long allocated; long deallocated; long fromPool; long toPool; // how many objects have been added to this pool long given; // how many blocks have been moved from this pool to a shared pool long taken; // how many blocks have been moved from a shared pool to this pool long retired; // how many objects have been retired volatile char padding2[PREFETCH_SIZE_BYTES]; }; class debugInfo { private: const int NUM_PROCESSES; _memrecl_counters c[MAX_TID_POW2]; public: void clear() { for (int tid=0;tid #include #include "plaf.h" using namespace std; class debugCounter { private: const int NUM_PROCESSES; volatile long long * data; // data[tid*PREFETCH_SIZE_WORDS] = count for thread tid (padded to avoid false sharing) public: void add(const int tid, const long long val) { data[tid*PREFETCH_SIZE_WORDS] += val; } void inc(const int tid) { add(tid, 1); } long long get(const int tid) { return data[tid*PREFETCH_SIZE_WORDS]; } long long getTotal() { long long result = 0; for (int tid=0;tid #include #define COUTATOMIC(coutstr) /*cout< #include #include #include "plaf.h" using namespace std; namespace hashset_namespace { // note: TABLE_SIZE must be a power of two for bitwise operations below to work #define TABLE_SIZE 32 #define FIRST_INDEX(key) (hash((key)) & (TABLE_SIZE-1)) #define NEXT_INDEX(ix) ((ix)+1 % TABLE_SIZE) #define EMPTY_CELL 0 template class hashset { private: bool cleared; K* keys[TABLE_SIZE]; inline int hash(K * const key) { // MurmurHash3's integer finalizer long long k = (long long) key; k ^= k >> 33; k *= 0xff51afd7ed558ccd; k ^= k >> 33; k *= 0xc4ceb9fe1a85ec53; k ^= k >> 33; return k; } int getIndex(K * const key) { int ix; for (ix=FIRST_INDEX(key) ; keys[ix] != EMPTY_CELL && keys[ix] != key ; ix=NEXT_INDEX(ix)) { assert(ix >= 0); assert(ix < TABLE_SIZE); } assert(ix >= 0); assert(ix < TABLE_SIZE); return ix; } public: hashset() { VERBOSE DEBUG std::cout<<"constructor hashset"< // class AtomicHashSet { // private: // int size; // NOT ATOMICALLY ACCESSIBLE BY OTHER THREADS THAN OWNER // bool cleared; // NOT ATOMICALLY ACCESSIBLE BY OTHER THREADS THAN OWNER // atomic_uintptr_t keys[TABLE_SIZE]; // inline int hash(K * const key) { // // MurmurHash3's integer finalizer // long long k = (long long) key; // k ^= k >> 33; // k *= 0xff51afd7ed558ccd; // k ^= k >> 33; // k *= 0xc4ceb9fe1a85ec53; // k ^= k >> 33; // return k; // } // int getIndex(K * const key) { // int ix; // for (ix=FIRST_INDEX(key) // ; keys[ix] != EMPTY_CELL && keys[ix] != key // ; ix=NEXT_INDEX(ix)) { // assert(ix >= 0); // assert(ix < TABLE_SIZE); // } // assert(ix >= 0); // assert(ix < TABLE_SIZE); // return ix; // } // public: // AtomicHashSet() { // VERBOSE DEBUG std::cout<<"constructor AtomicHashSet"< class hashset_new { private: int tableSize; K** keys; int __size; inline long hash(K * const key) { // MurmurHash3's integer finalizer long long k = (long long) key; k ^= k >> 33; k *= 0xff51afd7ed558ccd; k ^= k >> 33; k *= 0xc4ceb9fe1a85ec53; k ^= k >> 33; return k; } inline int getIndex(K * const key) { int ix = firstIndex(key); assert(ix >= 0); assert(ix < tableSize); while (true) { if (keys[ix] == EMPTY_CELL || keys[ix] == key) { return ix; } ix = nextIndex(ix); assert(ix >= 0); assert(ix < tableSize); } } inline int firstIndex(K * const key) { return (hash(key) & (tableSize-1)); } inline int nextIndex(const int ix) { return ((ix+1) & (tableSize-1)); 
} public: hashset_new(const int numberOfElements) { tableSize = 32; while (tableSize < numberOfElements*2) { tableSize *= 2; } VERBOSE DEBUG std::cout<<"constructor hashset_new capacity="< #include #include "blockbag.h" using namespace std; #ifndef VERBOSE #define VERBOSE if(0) #endif // lock free bag that operates on elements of the block type, // defined in blockbag.h. this class does NOT allocate or deallocate any memory. // instead, it simply chains blocks together using their next pointers. // the implementation is a stack, with push and pop at the head. // the aba problem is avoided using version numbers with a double-wide CAS. // any contention issues with using a simple stack and overhead issues with // double-wide CAS are unimportant, because operations on this bag only happen // once a process has filled up two blocks of objects and needs to hand one // off. thus, the number of operations on this class is several orders of // magnitude smaller than the number of operations on the binary search tree. template class lockfreeblockbag { private: struct tagged_ptr { block *ptr; long tag; }; std::atomic head; public: lockfreeblockbag() { VERBOSE DEBUG std::cout<<"constructor lockfreeblockbag lockfree="< *curr = head.load(memory_order_relaxed).ptr; int debugFreed = 0; while (curr) { block * const temp = curr; curr = curr->next; //DEBUG ++debugFreed; delete temp; } VERBOSE DEBUG std::cout<<"freed "<* getBlock() { while (true) { tagged_ptr expHead = head.load(memory_order_relaxed); if (expHead.ptr != NULL) { if (head.compare_exchange_weak( expHead, tagged_ptr({expHead.ptr->next, expHead.tag+1}))) { block *result = expHead.ptr; result->next = NULL; return result; } } else { return NULL; } } } void addBlock(block *b) { while (true) { tagged_ptr expHead = head.load(memory_order_relaxed); b->next = expHead.ptr; if (head.compare_exchange_weak( expHead, tagged_ptr({b, expHead.tag+1}))) { return; } } } // NOT thread safe int sizeInBlocks() { int result = 0; block *curr = head.load(memory_order_relaxed).ptr; while (curr) { ++result; curr = curr->next; } return result; } // thread safe, but concurrent operations are very likely to starve it long long size() { while (1) { long long result = 0; block *originalHead = head.load(memory_order_relaxed).ptr; block *curr = originalHead; while (curr) { result += curr->computeSize(); curr = curr->next; } if (head.load(memory_order_relaxed).ptr == originalHead) { return result; } } } }; #endif /* LOCKFREESTACK_H */ ================================================ FILE: datastructures/trevor_brown_abtree/common/recordmgr/pool_interface.h ================================================ /** * C++ record manager implementation (PODC 2015) by Trevor Brown. * * Copyright (C) 2015 Trevor Brown * */ #ifndef POOL_INTERFACE_H #define POOL_INTERFACE_H #include #include "allocator_interface.h" #include "debug_info.h" #include "blockpool.h" #include "blockbag.h" using namespace std; template > class pool_interface { public: debugInfo * const debug; const int NUM_PROCESSES; blockpool **blockpools; // allocated (or not) and freed by descendants Alloc *alloc; template struct rebind { typedef pool_interface<_Tp1, Alloc> other; }; template struct rebind2 { typedef pool_interface<_Tp1, _Tp2> other; }; string getSizeString() { return ""; } // long long getSizeInNodes() { return 0; } /** * if the pool contains any object, then remove one from the pool * and return a pointer to it. otherwise, return NULL. 
*/ inline T* get(const int tid); inline void add(const int tid, T* ptr); inline void addMoveFullBlocks(const int tid, blockbag *bag); inline void addMoveAll(const int tid, blockbag *bag); inline int computeSize(const int tid); void debugPrintStatus(const int tid); pool_interface(const int numProcesses, Alloc * const _alloc, debugInfo * const _debug) : debug(_debug) , NUM_PROCESSES(numProcesses) , alloc(_alloc){ VERBOSE DEBUG std::cout<<"constructor pool_interface"<blockpools = new blockpool*[numProcesses]; for (int tid=0;tidblockpools[tid] = new blockpool(); } } ~pool_interface() { VERBOSE DEBUG std::cout<<"destructor pool_interface"<NUM_PROCESSES;++tid) { delete this->blockpools[tid]; } delete[] this->blockpools; } }; #endif ================================================ FILE: datastructures/trevor_brown_abtree/common/recordmgr/pool_none.h ================================================ /** * C++ record manager implementation (PODC 2015) by Trevor Brown. * * Copyright (C) 2015 Trevor Brown * */ #ifndef POOL_NOOP_H #define POOL_NOOP_H #include #include #include "blockbag.h" #include "blockpool.h" #include "pool_interface.h" #include "plaf.h" using namespace std; template > class pool_none : public pool_interface { public: template struct rebind { typedef pool_none<_Tp1, Alloc> other; }; template struct rebind2 { typedef pool_none<_Tp1, _Tp2> other; }; string getSizeString() { return "no pool"; } /** * if the freebag contains any object, then remove one from the freebag * and return a pointer to it. * if not, then retrieve a new object from Alloc */ inline T* get(const int tid) { MEMORY_STATS2 this->alloc->debug->addFromPool(tid, 1); return this->alloc->allocate(tid); } inline void add(const int tid, T* ptr) { this->alloc->deallocate(tid, ptr); } inline void addMoveFullBlocks(const int tid, blockbag *bag, block * const predecessor) { bag->clearWithoutFreeingElements(); // note: this will leak memory, but i believe it is only used by debraplus (which really should use a pool) } inline void addMoveFullBlocks(const int tid, blockbag *bag) { this->alloc->deallocateAndClear(tid, bag); // T* ptr; // while (ptr = bag->remove()) { // add(tid, ptr); // } } inline void addMoveAll(const int tid, blockbag *bag) { this->alloc->deallocateAndClear(tid, bag); // T* ptr; // while (ptr = bag->remove()) { // add(tid, ptr); // } } inline int computeSize(const int tid) { return 0; } void debugPrintStatus(const int tid) { } pool_none(const int numProcesses, Alloc * const _alloc, debugInfo * const _debug) : pool_interface(numProcesses, _alloc, _debug) { VERBOSE DEBUG std::cout<<"constructor pool_none"< #include #include #include "blockbag.h" #include "blockpool.h" #include "pool_interface.h" #include "plaf.h" #include "globals.h" using namespace std; #define POOL_THRESHOLD_IN_BLOCKS 10 template > class pool_perthread_and_shared : public pool_interface { private: lockfreeblockbag *sharedBag; // shared bag that we offload blocks on when we have too many in our freeBag blockbag **freeBag; // freeBag[tid] = bag of objects of type T that are ready to be reused by the thread with id tid // note: only does something if freeBag contains at least two full blocks inline bool tryGiveFreeObjects(const int tid) { if (freeBag[tid]->getSizeInBlocks() >= POOL_THRESHOLD_IN_BLOCKS) { block *b = freeBag[tid]->removeFullBlock(); // returns NULL if freeBag has < 2 full blocks assert(b); // if (b) { sharedBag->addBlock(b); MEMORY_STATS this->debug->addGiven(tid, 1); //DEBUG2 COUTATOMIC(" thread "<tid<<" 
sharedBag("<<(sizeof(T)==sizeof(Node)?"Node":"SCXRecord")<<") now contains "<size()<<" blocks"< *b = sharedBag->getBlock(); // if (b) { // freeBag[tid]->addFullBlock(b); // DEBUG this->debug->addTaken(tid, 1); // //DEBUG2 COUTATOMIC(" thread "<tid<<" took "<computeSize()<<" objects from sharedBag"< struct rebind { typedef pool_perthread_and_shared<_Tp1, Alloc> other; }; template struct rebind2 { typedef pool_perthread_and_shared<_Tp1, _Tp2> other; }; // long long getSizeInNodes() { // long long sum = 0; // for (int tid=0;tidNUM_PROCESSES;++tid) { // sum += freeBag[tid]->computeSize(); // } //// sum += sharedBag->sizeInBlocks() * BLOCK_SIZE; // return sum; // } string getSizeString() { stringstream ss; long long insharedbag = sharedBag->size(); long long infreebags = 0; for (int tid=0;tidNUM_PROCESSES;++tid) { infreebags += freeBag[tid]->computeSize(); } ss<alloc->debug->addFromPool(tid, 1); return freeBag[tid]->template remove(tid, sharedBag, this->alloc); } inline void add(const int tid, T* ptr) { MEMORY_STATS2 this->debug->addToPool(tid, 1); freeBag[tid]->add(tid, ptr, sharedBag, POOL_THRESHOLD_IN_BLOCKS, this->alloc); } inline void addMoveFullBlocks(const int tid, blockbag *bag, block * const predecessor) { // WARNING: THE FOLLOWING DEBUG COMPUTATION GETS THE WRONG NUMBER OF BLOCKS. MEMORY_STATS2 this->debug->addToPool(tid, (bag->getSizeInBlocks()-1)*BLOCK_SIZE); freeBag[tid]->appendMoveFullBlocks(bag, predecessor); while (tryGiveFreeObjects(tid)) {} } inline void addMoveFullBlocks(const int tid, blockbag *bag) { // WARNING: THE FOLLOWING DEBUG COMPUTATION GETS THE WRONG NUMBER OF BLOCKS. MEMORY_STATS2 this->debug->addToPool(tid, (bag->getSizeInBlocks()-1)*BLOCK_SIZE); freeBag[tid]->appendMoveFullBlocks(bag); while (tryGiveFreeObjects(tid)) {} } inline void addMoveAll(const int tid, blockbag *bag) { MEMORY_STATS2 this->debug->addToPool(tid, bag->computeSize()); freeBag[tid]->appendMoveAll(bag); while (tryGiveFreeObjects(tid)) {} } inline int computeSize(const int tid) { return freeBag[tid]->computeSize(); } void debugPrintStatus(const int tid) { // long free = computeSize(tid); // long share = sharedBag->sizeInBlocks(); // COUTATOMIC("free="<*[numProcesses]; for (int tid=0;tid(tid, this->blockpools[tid]); } sharedBag = new lockfreeblockbag(); } ~pool_perthread_and_shared() { VERBOSE DEBUG COUTATOMIC("destructor pool_perthread_and_shared"< *fullBlock; while ((fullBlock = sharedBag->getBlock()) != NULL) { while (!fullBlock->isEmpty()) { T * const ptr = fullBlock->pop(); this->alloc->deallocate(dummyTid, ptr); } this->blockpools[dummyTid]->deallocateBlock(fullBlock); } // clean up free bags for (int tid=0;tidNUM_PROCESSES;++tid) { this->alloc->deallocateAndClear(tid, freeBag[tid]); delete freeBag[tid]; } delete[] freeBag; delete sharedBag; } }; #endif ================================================ FILE: datastructures/trevor_brown_abtree/common/recordmgr/reclaimer_debra.h ================================================ /** * C++ record manager implementation (PODC 2015) by Trevor Brown. 
* * Copyright (C) 2015 Trevor Brown * */ #ifndef RECLAIM_EPOCH_H #define RECLAIM_EPOCH_H #include #include #include #include #include "blockbag.h" #include "plaf.h" #include "allocator_interface.h" #include "reclaimer_interface.h" using namespace std; template > class reclaimer_debra : public reclaimer_interface { protected: #define EPOCH_INCREMENT 2 #define BITS_EPOCH(ann) ((ann)&~(EPOCH_INCREMENT-1)) #define QUIESCENT(ann) ((ann)&1) #define GET_WITH_QUIESCENT(ann) ((ann)|1) #ifdef RAPID_RECLAMATION #define MIN_OPS_BEFORE_READ 1 //#define MIN_OPS_BEFORE_CAS_EPOCH 1 #else #define MIN_OPS_BEFORE_READ 20 //#define MIN_OPS_BEFORE_CAS_EPOCH 100 #endif #define NUMBER_OF_EPOCH_BAGS 9 #define NUMBER_OF_ALWAYS_EMPTY_EPOCH_BAGS 3 // for epoch based reclamation volatile long epoch; atomic_long *announcedEpoch; // announcedEpoch[tid*PREFETCH_SIZE_WORDS] // todo: figure out if volatile here would help processes notice changes more quickly. long *checked; // checked[tid*PREFETCH_SIZE_WORDS] = how far we've come in checking the announced epochs of other threads blockbag **epochbags; // epochbags[NUMBER_OF_EPOCH_BAGS*tid+0..NUMBER_OF_EPOCH_BAGS*tid+(NUMBER_OF_EPOCH_BAGS-1)] are epoch bags for thread tid. blockbag **currentBag; // pointer to current epoch bag for each process long *index; // index of currentBag in epochbags for each process // note: oldest bag is number (index+1)%NUMBER_OF_EPOCH_BAGS long *opsSinceRead; public: template struct rebind { typedef reclaimer_debra<_Tp1, Pool> other; }; template struct rebind2 { typedef reclaimer_debra<_Tp1, _Tp2> other; }; // inline int getOldestBlockbagIndexOffset(const int tid) { // long long min_val = LLONG_MAX; // int min_i = -1; // for (int i=0;igetReclaimCount(); // if (reclaimCount % 1) { // bag's contents are currently being freed // return i; // } // if (reclaimCount < min_val) { // min_val = reclaimCount; // min_i = i; // } // } // return min_i; // } // // inline set_of_bags getBlockbags() { // blockbag_iterator ** const output) { //// int cnt=0; //// for (int tid=0;tidNUM_PROCESSES*NUMBER_OF_EPOCH_BAGS}; // } // // inline void getOldestTwoBlockbags(const int tid, blockbag ** oldest, blockbag ** secondOldest) { // long long min_val = LLONG_MAX; // int min_i = -1; // for (int i=0;igetReclaimCount(); // if (reclaimCount % 1) { // bag's contents are currently being freed // min_i = i; // break; // } // if (reclaimCount < min_val) { // min_val = reclaimCount; // min_i = i; // } // } // if (min_i == -1) { // *oldest = *secondOldest = NULL; // } else { // *oldest = epochbags[tid*NUMBER_OF_EPOCH_BAGS + min_i]; // *secondOldest = epochbags[tid*NUMBER_OF_EPOCH_BAGS + ((min_i+1)%NUMBER_OF_EPOCH_BAGS)]; // } // } inline void getSafeBlockbags(const int tid, blockbag ** bags) { SOFTWARE_BARRIER; int ix = index[tid*PREFETCH_SIZE_WORDS]; bags[0] = epochbags[tid*NUMBER_OF_EPOCH_BAGS+ix]; bags[1] = epochbags[tid*NUMBER_OF_EPOCH_BAGS+((ix+NUMBER_OF_EPOCH_BAGS-1)%NUMBER_OF_EPOCH_BAGS)]; bags[2] = epochbags[tid*NUMBER_OF_EPOCH_BAGS+((ix+NUMBER_OF_EPOCH_BAGS-2)%NUMBER_OF_EPOCH_BAGS)]; bags[3] = NULL; SOFTWARE_BARRIER; // SOFTWARE_BARRIER; // // find first dangerous blockbag // long long min_val = LLONG_MAX; // int min_i = -1; // for (int i=0;igetReclaimCount(); // if (reclaimCount % 1) { // bag's contents are currently being freed // min_i = i; // break; // } // if (reclaimCount < min_val) { // min_val = reclaimCount; // min_i = i; // } // } // assert(min_i != -1); // min_i = (min_i + NUMBER_OF_ALWAYS_EMPTY_EPOCH_BAGS) % NUMBER_OF_EPOCH_BAGS; // // // process might free 
from bag at offset min_i, or the next one. // // the others are safe. // int i; // for (i=0;iNUM_PROCESSES;++tid) { for (int j=0;jcomputeSize(); } } return sum; } string getSizeString() { stringstream ss; ss< * const freeable = epochbags[NUMBER_OF_EPOCH_BAGS*tid + ((nextIndex+NUMBER_OF_ALWAYS_EMPTY_EPOCH_BAGS) % NUMBER_OF_EPOCH_BAGS)]; this->pool->addMoveFullBlocks(tid, freeable); // moves any full blocks (may leave a non-full block behind) SOFTWARE_BARRIER; index[tid*PREFETCH_SIZE_WORDS] = nextIndex; currentBag[tid*PREFETCH_SIZE_WORDS] = epochbags[NUMBER_OF_EPOCH_BAGS*tid + nextIndex]; } // objects reclaimed by this epoch manager. // returns true if the call rotated the epoch bags for thread tid // (and reclaimed any objects retired two epochs ago). // otherwise, the call returns false. inline bool leaveQuiescentState(const int tid, void * const * const reclaimers, const int numReclaimers) { SOFTWARE_BARRIER; // prevent any bookkeeping from being moved after this point by the compiler. bool result = false; // ver 1 long readEpoch = epoch; const long ann = announcedEpoch[tid*PREFETCH_SIZE_WORDS].load(memory_order_relaxed); // // debug ver2 // const long ann = announcedEpoch[tid*PREFETCH_SIZE_WORDS].load(memory_order_relaxed); // ++opsSinceRead[tid*PREFETCH_SIZE_WORDS]; // long readEpoch = ((opsSinceRead[tid*PREFETCH_SIZE_WORDS] % MIN_OPS_BEFORE_READ) == 0) ? epoch : BITS_EPOCH(ann); // if our announced epoch is different from the current epoch if (readEpoch != BITS_EPOCH(ann)) { // announce the new epoch, and rotate the epoch bags and // reclaim any objects retired two epochs ago. checked[tid*PREFETCH_SIZE_WORDS] = 0; //rotateEpochBags(tid); for (int i=0;i * const) reclaimers[i])->rotateEpochBags(tid); } result = true; } // note: readEpoch, when written to announcedEpoch[tid], // will set the state to non-quiescent and non-neutralized // incrementally scan the announced epochs of all threads int otherTid = checked[tid*PREFETCH_SIZE_WORDS]; if ((++opsSinceRead[tid*PREFETCH_SIZE_WORDS] % MIN_OPS_BEFORE_READ) == 0) { long otherAnnounce = announcedEpoch[otherTid*PREFETCH_SIZE_WORDS].load(memory_order_relaxed); if (BITS_EPOCH(otherAnnounce) == readEpoch || QUIESCENT(otherAnnounce)) { const int c = ++checked[tid*PREFETCH_SIZE_WORDS]; if (c >= this->NUM_PROCESSES /*&& c > MIN_OPS_BEFORE_CAS_EPOCH*/) { __sync_bool_compare_and_swap(&epoch, readEpoch, readEpoch+EPOCH_INCREMENT); } } } SOFTWARE_BARRIER; if (readEpoch != ann) { announcedEpoch[tid*PREFETCH_SIZE_WORDS].store(readEpoch, memory_order_relaxed); } return result; } inline void enterQuiescentState(const int tid) { const long ann = announcedEpoch[tid*PREFETCH_SIZE_WORDS].load(memory_order_relaxed); announcedEpoch[tid*PREFETCH_SIZE_WORDS].store(GET_WITH_QUIESCENT(ann), memory_order_relaxed); } // for all schemes except reference counting inline void retire(const int tid, T* p) { currentBag[tid*PREFETCH_SIZE_WORDS]->add(p); DEBUG2 this->debug->addRetired(tid, 1); } inline void unretireLast(const int tid) { assert(false); // we do not use this, since it makes it harder to reason about iteration over blockbags when they shrink (aside from when their contents are being reclaimed, and we can determine this is the case by inspecting bag->getReclaimCount()...) 
currentBag[tid*PREFETCH_SIZE_WORDS]->remove(); } void debugPrintStatus(const int tid) { // assert(tid >= 0); // assert(tid < this->NUM_PROCESSES); if (tid == 0) { std::cout<<"global epoch counter="< * const _recoveryMgr = NULL) : reclaimer_interface(numProcesses, _pool, _debug, _recoveryMgr) { VERBOSE std::cout<<"constructor reclaimer_debra helping="<shouldHelp()<NUM_PROCESSES;++tid) { // move contents of all bags into pool for (int i=0;icomputeSize()<<" objects from epoch bag of tid="<pool->addMoveAll(tid, epochbags[NUMBER_OF_EPOCH_BAGS*tid+i]); delete epochbags[NUMBER_OF_EPOCH_BAGS*tid+i]; } } delete[] epochbags; delete[] index; delete[] opsSinceRead; delete[] currentBag; delete[] announcedEpoch; delete[] checked; } }; #endif ================================================ FILE: datastructures/trevor_brown_abtree/common/recordmgr/reclaimer_debraplus.h ================================================ /** * C++ record manager implementation (PODC 2015) by Trevor Brown. * * Copyright (C) 2015 Trevor Brown * */ #ifndef RECLAIM_EPOCH_CRASHRECOV_H #define RECLAIM_EPOCH_CRASHRECOV_H #include #include #include "plaf.h" #include "globals.h" #include "blockbag.h" #include "allocator_interface.h" #include "reclaimer_interface.h" #include "arraylist.h" #include "hashtable.h" #include "record_manager_single_type.h" using namespace std; using namespace hashset_namespace; template > class reclaimer_debraplus : public reclaimer_interface { private: #define EPOCH_INCREMENT 2 #define BITS_EPOCH(ann) ((ann)&~(EPOCH_INCREMENT-1)) #define QUIESCENT(ann) ((ann)&1) #define GET_WITH_QUIESCENT(ann) ((ann)|1) // the following threshold allows a process to accumulate about 768 objects in each epoch bag // (3*BLOCK_SIZE=768, but there are other things that inflate bag size slightly, such as // the fact that a thread can do n operations before it successfully neutralizes each thread // and can advance the epoch.) #define NEUTRALIZE_THRESHOLD_IN_BLOCKS 4 // maximum number of objects that can be simultaneously protected by calls to qProtect() #define MAX_PROTECT_EVEN_IF_QUIESCENT 7 #define MINIMUM_OPERATIONS_BEFORE_NEW_EPOCH_CR 100 #define NUMBER_OF_EPOCH_BAGS_CR 3 // for epoch based reclamation volatile long epoch; atomic_long *announcedEpoch; // announcedEpoch[tid*PREFETCH_SIZE_WORDS] = bits 1..end contain the last epoch seen by thread tid, and bit 0 indicates quiescence long *checked; // checked[tid*PREFETCH_SIZE_WORDS] = how far we've come in checking the announced epochs of other threads blockbag **epochbags; // epochbags[NUMBER_OF_EPOCH_BAGS*tid+0..NUMBER_OF_EPOCH_BAGS*tid+(NUMBER_OF_EPOCH_BAGS-1)] are epoch bags for thread tid. blockbag **currentBag; // pointer to current epoch bag for each process long *index; // index of currentBag in epochbags for each process // note: oldest bag is number (index+1)%NUMBER_OF_EPOCH_BAGS_CR // for hazard pointer component of this scheme; // each thread has a single hazard pointer that it uses to prevent // other threads from reclaiming its current scx record before it can // clean up after itself. AtomicArrayList **announce; // announce[tid] = pointer to set of hazard pointers for thread tid hashset_new **comparing; // comparing[tid] = set of announced hazard pointers for ALL threads, as collected by thread tid during it's last retire(tid, ...) call // number of blocks retired[tid] must contain before it is guaranteed to // contain at least 5*numProcesses*MAX_PROTECT_EVEN_IF_QUIESCENT items... // why 5*numProcesses*MAX_PROTECT_EVEN_IF_QUIESCENT items? 
// to get amortized constant scanning time per object, // the number of elements that retired[tid] must contain // before we scan hazard pointers to determine // which elements of retired[tid] can be deallocated // must be nk+Omega(nk), where // n = number of threads and // k = max number of hazard pointers a thread can hold at once // in this context, k=MAX_PROTECT_EVEN_IF_QUIESCENT, since a thread only obtains // a hazard pointer to the scx record it has most recently created, and // the nodes it points to. so, we just need some constant times // numProcesses*MAX_PROTECT_EVEN_IF_QUIESCENT. static const int scanThreshold = 4; sigset_t neutralizeSignalSet; inline bool neutralizeOther(const int tid, const int otherTid, const long currentEpoch, const long announceOther) { #ifdef SEND_CRASH_RECOVERY_SIGNALS assert(isQuiescent(tid)); assert(otherTid != tid); // if the epoch bag is too full, then we suspect otherTid has crashed... if (epochbags[NUMBER_OF_EPOCH_BAGS_CR*tid+index[tid*PREFETCH_SIZE_WORDS]]->getSizeInBlocks() >= NEUTRALIZE_THRESHOLD_IN_BLOCKS) { // neutralize otherTid by sending him a signal to make him // change what his next step will be, and force him to // throw away all pointers into the data structure, and // leaveQstate again before re-acquiring any pointers into // the data structure. this lets us reclaim memory without // waiting for him to progress. pthread_t otherPthread = this->recoveryMgr->getPthread(otherTid); int error = 0; // COUTATOMICTID("sending signal to tid "<recoveryMgr->neutralizeSignal)) { // should never happen for (int i=0;i<20;++i) COUTATOMICTID("######################################################"<recoveryMgr->neutralizeSignal<<")"< struct rebind { typedef reclaimer_debraplus<_Tp1, Pool> other; }; template struct rebind2 { typedef reclaimer_debraplus<_Tp1, _Tp2> other; }; inline static bool quiescenceIsPerRecordType() { return false; } inline static bool supportsCrashRecovery() { return true; } inline bool isQuiescent(const int tid) { //COUTATOMICTID("IS QUIESCENT EXECUTED"<contains(obj); // this is inefficient, but should only happen when recovering from being neutralized... } inline bool qProtect(const int tid, T * const obj, CallbackType notRetiredCallback, CallbackArg callbackArg, bool memoryBarrier = true) { TRACE COUTATOMICTID("reclaimer_debraplus::protectObjectEvenIfQuiescent(tid="<size(); DEBUG assert(__size < MAX_PROTECT_EVEN_IF_QUIESCENT); announce[tid]->add(obj); assert(announce[tid]->contains(obj)); DEBUG assert(announce[tid]->size() == __size+1); // if callbackArg = NULL, we assume notRetiredCallback is a noop. if (memoryBarrier) __sync_synchronize(); // prevent retired from being read before we set a hazard pointer to obj, and prevent any future reads of fields of obj from being moved before we announce obj. if (notRetiredCallback(callbackArg)) { TRACE COUTATOMICTID("notRetiredCallback returns true"<erase(obj); // note: this is inefficient, but it should never happen with regular use. DEBUG assert(__size == announce[tid]->size()); return false; } } inline void qUnprotectAll(const int tid) { TRACE COUTATOMICTID("reclaimer_debraplus::unprotectAllObjectsEvenIfQuiescent(tid="<clear(); assert(announce[tid]->size() == 0); } // rotate the epoch bags and reclaim any objects retired two epochs ago. inline void rotateEpochBags(const int tid) { assert(isQuiescent(tid)); // we rotate lists in constant time, and scan hazard pointers // when the blockbag from two epochs ago is larger than scanThreshold // (using an iterator with erase functionality). 
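// a worked instance of the threshold arithmetic above (illustrative numbers):
// with n = numProcesses = 8 threads and k = MAX_PROTECT_EVEN_IF_QUIESCENT = 7,
// at most nk = 56 objects are qProtected at any time; one scan of all
// announcements costs O(nk), so waiting until a bag holds c*nk objects
// (any constant c > 1) before scanning amortizes the scan to O(1) per object.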
// maybe in the future we could use bloom filters somehow to determine when no hazard pointer // can be present in a block, so we can reclaim the entire block in O(k) time...??? // (if we're willing to accept k full, unreclaimable blocks per thread, then we can avoid // working with individual elements altogether. we can simply check if each HP is in the bloom // filter for each of c*n blocks (for some constant c), and have some probability of being // able to reclaim (c-1)*n blocks. then, this procedure will be worst case O(n) time.) index[tid*PREFETCH_SIZE_WORDS] = (index[tid*PREFETCH_SIZE_WORDS]+1) % NUMBER_OF_EPOCH_BAGS_CR; blockbag * const freeable = epochbags[NUMBER_OF_EPOCH_BAGS_CR*tid+index[tid*PREFETCH_SIZE_WORDS]]; if (freeable->getSizeInBlocks() >= scanThreshold) { TRACE COUTATOMICTID("retiring... we have "<computeSize()<<" things waiting to be retired in this epoch bag..."); // hash all announcements comparing[tid]->clear(); assert(comparing[tid]->size() == 0); for (int otherTid=0; otherTid < this->NUM_PROCESSES; ++otherTid) { int sz = announce[otherTid]->size(); for (int ixHP = 0; ixHP < sz; ++ixHP) { T* hp = (T*) announce[otherTid]->get(ixHP); if (hp) { int oldSize; DEBUG2 oldSize = comparing[tid]->size(); comparing[tid]->insert((T*) hp); DEBUG2 assert(comparing[tid]->size() <= oldSize + 1); // might not increase size if comparing[tid] already contains this item... } } } // check if any nodes (from two epochs ago) are announced (qprotected) // and swap them to the front of the blockbag. // once all announced nodes are at the front of the blockbag, // we can free whole blocks in the remainder of the blockbag. blockbag_iterator it = freeable->begin(); blockbag_iterator nextswap = freeable->begin(); while (it != freeable->end()) { if (comparing[tid]->contains(*it)) { // a hazard pointers points to the item it.swap(nextswap.getCurr(), nextswap.getIndex()); nextswap++; } it++; } block * const curr = nextswap.getCurr(); if (curr) { this->pool->addMoveFullBlocks(tid, freeable, curr); } } currentBag[tid*PREFETCH_SIZE_WORDS] = freeable; assert(isQuiescent(tid)); } // invoke this at the beginning of each operation that accesses // objects reclaimed by this epoch manager. // returns true if the call rotated the epoch bags for thread tid // (and reclaimed any objects retired two epochs ago). // otherwise, the call returns false. // IMPLIES A FULL MEMORY BARRIER inline bool leaveQuiescentState(const int tid, void * const * const reclaimers, const int numReclaimers) { SOFTWARE_BARRIER; // prevent any bookkeeping from being moved after this point by the compiler. 
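        // recap of the announced-value encoding used below (illustrative):
        //   long ann = announcedEpoch[tid*PREFETCH_SIZE_WORDS].load(memory_order_relaxed);
        //   long e   = BITS_EPOCH(ann);  // epoch: bit 0 cleared (epoch advances by 2)
        //   bool q   = QUIESCENT(ann);   // bit 0 set means tid is quiescent
        // GET_WITH_QUIESCENT(e) sets bit 0 again when entering a quiescent state.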
bool result = false; long readEpoch = epoch; // multiple of EPOCH_INCREMENT assert(!QUIESCENT(readEpoch)); // if our announced epoch is different from the current epoch const long ann = announcedEpoch[tid*PREFETCH_SIZE_WORDS].load(memory_order_relaxed); DEBUG2 if (!QUIESCENT(ann)) { COUTATOMICTID("NOT QUIESCENT"< * const) reclaimers[i])->rotateEpochBags(tid); } result = true; } // note: readEpoch, when written to announcedEpoch[tid], // will set the state to non-quiescent and non-neutralized // incrementally scan the announced epochs of all threads int otherTid = checked[tid*PREFETCH_SIZE_WORDS]; if (otherTid >= this->NUM_PROCESSES) { const int c = ++checked[tid*PREFETCH_SIZE_WORDS]; if (c > MINIMUM_OPERATIONS_BEFORE_NEW_EPOCH_CR) { __sync_bool_compare_and_swap(&epoch, readEpoch, readEpoch+EPOCH_INCREMENT); } } else { assert(otherTid >= 0); long otherAnnounce = announcedEpoch[otherTid*PREFETCH_SIZE_WORDS].load(memory_order_relaxed); if (BITS_EPOCH(otherAnnounce) == readEpoch || QUIESCENT(otherAnnounce) || neutralizeOther(tid, otherTid, readEpoch, otherAnnounce)) { const int c = ++checked[tid*PREFETCH_SIZE_WORDS]; if (c >= this->NUM_PROCESSES && c > MINIMUM_OPERATIONS_BEFORE_NEW_EPOCH_CR) { __sync_bool_compare_and_swap(&epoch, readEpoch, readEpoch+EPOCH_INCREMENT); } } } // it is important that we set the announcedEpoch last, because we must // not be neutralized during some of the preceding steps, or we may // corrupt the data structure. // (on x86/64, writes are not moved earlier in program order, so we don't need any membar before this write.) // (on another arch, we'd have to prevent this write from being moved before the write to checked[].) assert(isQuiescent(tid)); SOFTWARE_BARRIER; announcedEpoch[tid*PREFETCH_SIZE_WORDS].store(readEpoch, memory_order_relaxed); return result; } // IN A SCHEME THAT SUPPORTS CRASH RECOVERY, THIS IMPLIES A FULL MEMORY BARRIER IFF THIS MOVES THE THREAD FROM AN ACTIVE STATE TO A QUIESCENT STATE inline void enterQuiescentState(const int tid) { const long ann = announcedEpoch[tid*PREFETCH_SIZE_WORDS].load(memory_order_relaxed); announcedEpoch[tid*PREFETCH_SIZE_WORDS].store(GET_WITH_QUIESCENT(ann), memory_order_relaxed); assert(isQuiescent(tid)); } // for all schemes except reference counting inline void retire(const int tid, T* p) { assert(isQuiescent(tid)); currentBag[tid*PREFETCH_SIZE_WORDS]->add(p); DEBUG2 this->debug->addRetired(tid, 1); } void debugPrintStatus(const int tid) { // assert(tid >= 0); // assert(tid < this->NUM_PROCESSES); // long announce = BITS_EPOCH(announcedEpoch[tid*PREFETCH_SIZE_WORDS].load(memory_order_relaxed))/EPOCH_INCREMENT; // std::cout<<"announce="<recoveryMgr->neutralizeSignal)) { COUTATOMIC("error adding signal to signal set"<*[NUMBER_OF_EPOCH_BAGS_CR*numProcesses]; currentBag = new blockbag*[numProcesses*PREFETCH_SIZE_WORDS]; index = new long[numProcesses*PREFETCH_SIZE_WORDS]; announcedEpoch = new atomic_long[numProcesses*PREFETCH_SIZE_WORDS]; checked = new long[numProcesses*PREFETCH_SIZE_WORDS]; announce = new AtomicArrayList*[numProcesses]; comparing = new hashset_new*[numProcesses]; for (int tid=0;tid(tid, this->pool->blockpools[tid]); } currentBag[tid*PREFETCH_SIZE_WORDS] = epochbags[NUMBER_OF_EPOCH_BAGS_CR*tid]; index[tid*PREFETCH_SIZE_WORDS] = 0; announcedEpoch[tid*PREFETCH_SIZE_WORDS].store(GET_WITH_QUIESCENT(0), memory_order_relaxed); checked[tid*PREFETCH_SIZE_WORDS] = 0; announce[tid] = new AtomicArrayList(MAX_PROTECT_EVEN_IF_QUIESCENT); comparing[tid] = new hashset_new(numProcesses*MAX_PROTECT_EVEN_IF_QUIESCENT); } 
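        // layout note (illustrative): per-thread metadata is strided by
        // PREFETCH_SIZE_WORDS -- e.g. announcedEpoch[tid*PREFETCH_SIZE_WORDS] --
        // so that each thread's hot fields occupy a distinct cache line and
        // adjacent thread ids do not falsely share one.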
} ~reclaimer_debraplus() { VERBOSE DEBUG COUTATOMIC("destructor reclaimer_debraplus"<NUM_PROCESSES;++tid) { // move contents of all bags into pool for (int i=0;ipool->addMoveAll(tid, epochbags[NUMBER_OF_EPOCH_BAGS_CR*tid+i]); delete epochbags[NUMBER_OF_EPOCH_BAGS_CR*tid+i]; } delete comparing[tid]; delete announce[tid]; } delete[] announce; delete[] epochbags; delete[] index; delete[] currentBag; delete[] announcedEpoch; delete[] checked; delete[] comparing; } }; #endif ================================================ FILE: datastructures/trevor_brown_abtree/common/recordmgr/reclaimer_hazardptr.h ================================================ /** * C++ record manager implementation (PODC 2015) by Trevor Brown. * * Copyright (C) 2015 Trevor Brown * */ #ifndef RECLAIM_HAZARDPTR_STACK_H #define RECLAIM_HAZARDPTR_STACK_H #include #include #include #include #include "blockbag.h" #include "plaf.h" #include "allocator_interface.h" #include "hashtable.h" #include "reclaimer_interface.h" #include "arraylist.h" using namespace std; using namespace hashset_namespace; #define MAX_HAZARDPTRS_PER_THREAD 16 template > class reclaimer_hazardptr : public reclaimer_interface { private: AtomicArrayList **announce; // announce[tid] = set of announced hazard pointers for thread tid ArrayList **retired; // retired[tid] = set of retired objects for thread tid hashset_new **comparing; // comparing[tid] = set of announced hazard pointers for ALL threads, as collected by thread tid during it's last retire(tid, ...) call // number of elements that retired[tid] must contain // before we scan hazard pointers to determine // which elements of retired[tid] can be deallocated. // to get amortized constant scanning time per object, // this must be nk+Omega(nk), where // n = number of threads and // k = max number of hazard pointers a thread can hold at once const int scanThreshold; public: template struct rebind { typedef reclaimer_hazardptr<_Tp1, Pool> other; }; template struct rebind2 { typedef reclaimer_hazardptr<_Tp1, _Tp2> other; }; inline static bool shouldHelp() { return false; } bool isProtected(const int tid, T * const obj) { return announce[tid]->contains(obj); } bool static isQProtected(const int tid, T * const obj) { return false; } inline static bool isQuiescent(const int tid) { return true; } // for hazard pointers (and counting references from threads) inline bool protect(const int tid, T * const obj, CallbackType notRetiredCallback, CallbackArg callbackArg, bool memoryBarrier = true) { TRACE std::cout<<"reclaimer_hazardptr::protect(tid="<size(); // DEBUG if (sizeof(T) < 80 /* is a node */) assert(!announce[tid]->contains(obj)); announce[tid]->add(obj); if (memoryBarrier) __sync_synchronize(); // prevent retired from being read before we set a hazard pointer to obj DEBUG2 assert(isProtected(tid, obj)); //announce[tid]->contains(obj)); DEBUG2 assert(size + 1 == announce[tid]->size()); // SOFTWARE_BARRIER; if (notRetiredCallback(callbackArg)) { // SOFTWARE_BARRIER; TRACE std::cout<<"notRetiredCallback returns true"<size() <= MAX_HAZARDPTRS_PER_THREAD); DEBUG2 assert(isProtected(tid, obj)); // SOFTWARE_BARRIER; assert(isProtected(tid, obj)); return true; } else { TRACE std::cout<<"notRetiredCallback returns false"<size(); announce[tid]->erase(obj); // DEBUG if (sizeof(T) < 80 /* is a node */) assert(!announce[tid]->contains(obj)); DEBUG2 assert(size - 1 == announce[tid]->size()); // SOFTWARE_BARRIER; } inline bool qProtect(const int tid, T * const obj, CallbackType notRetiredCallback, CallbackArg 
callbackArg, bool memoryBarrier = true) { TRACE std::cout<<"reclaimer_debraplus::qProtect(tid="<clear(); // __sync_synchronize(); // announce[tid]->clearWithoutFreeingElements(); DEBUG2 assert(announce[tid]->size() == 0); DEBUG2 assert(announce[tid]->isEmpty()); // SOFTWARE_BARRIER; } inline static bool leaveQuiescentState(const int tid, void * const * const reclaimers, const int numReclaimers) { TRACE std::cout<<"reclaimer_hazardptr::leaveQuiescentState(tid="< 0) { int c = x % base; if (c < 10) os<<(char)(c+(int)'0'); else if (c < 10+26) os<<(char)(c-10+(int)'a'); else os<<(char)(c-10-26+(int)'A'); x /= base; } return os.str(); } inline void retire(const int tid, T* p) { TRACE std::cout<<"reclaimer_hazardptr::retire(tid="<debug->addRetired(tid, 1); retired[tid]->add(p); // if the retired bag is sufficiently large if (retired[tid]->isFull()) { // __sync_synchronize(); // not necessary, since there is a membar implied by the update cas between here and the marked bit that makes the retired predicate return true... (it follows that the retired predicate for a node u will see marked and return true if it executes when we are performing retire(u).) // TRACE std::cout<<"retiring... we have "<size()<<" things waiting to be retired (#hps="<size()<<")..."; // // hash all announcements // int totalSize = 0; // int sizes[MAX_TID_POW2]; // for (int otherTid=0; otherTid < this->NUM_PROCESSES; ++otherTid) { // sizes[otherTid] = announce[tid]->size(); // totalSize += sizes[otherTid]; // } // hashset_new hset = hashset_new(totalSize); // for (int otherTid=0; otherTid < this->NUM_PROCESSES; ++otherTid) { // for (int i=0;iget(i)); // } // } // // // iterate over all items in retired[tid] // TRACE std::cout<<"retiring... we have "<size()<<" things waiting to be retired (#hps="<size()<<", totalSize="<size();++ix) { // TRACE std::cout<<" "<get(ix))<<"="<<(hset.contains(retired[tid]->get(ix))?"1":"0"); // if (!hset.contains(retired[tid]->get(ix))) { // // no hazard pointers point to the item, so we send it to the pool // this->pool->add(tid, retired[tid]->get(ix)); // // now we remove the item from retired[tid] and // // adjust ix to continue where we left off // retired[tid]->erase(ix); // --ix; // } // } // TRACE std::cout<<" afterwards, we have "<size()<<" things waiting to be retired..."<size()<<" things waiting to be retired (THIS thread #hps="<size()<<")..."; // for (int ix=0;ixsize();) { // // check if retired[tid]->data[ix] is in any set of hazard pointers // bool found = false; // for (int otherTid=0;otherTidNUM_PROCESSES;++otherTid) { // int sz = announce[otherTid]->size(); // for (int ixHP=0;ixHPget(ix) == announce[otherTid]->get(ixHP)) { // found = true; // // break out of both loops // otherTid = this->NUM_PROCESSES; // break; // } // } // } // if (!found) { // // no hazard pointers point to the item, so we send it to the pool // this->pool->add(tid, retired[tid]->get(ix)); // // now we remove the item from retired[tid] // retired[tid]->erase(ix); // } else { // ++ix; // we didn't erase, so we need to move on to the next element // } // } // TRACE std::cout<<" afterwards, we have "<size()<<" things waiting to be retired..."<size()<<" things waiting to be retired (THIS thread #hps="<size()<<")..."; // hash all announcements comparing[tid]->clear(); assert(comparing[tid]->size() == 0); for (int otherTid=0; otherTid < this->NUM_PROCESSES; ++otherTid) { int sz = announce[otherTid]->size(); assert(sz < MAX_HAZARDPTRS_PER_THREAD); for (int ixHP=0;ixHPsize(); 
comparing[tid]->insert(announce[otherTid]->get(ixHP)); DEBUG2 assert(comparing[tid]->size() <= oldSize + 1); // might not increase size if comparing[tid] already contains this item... } } for (int ix=0;ixsize();) { // check if retired[tid]->data[ix] is in any set of hazard pointers if (!comparing[tid]->contains(retired[tid]->get(ix))) { // no hazard pointers point to the item, so we send it to the pool this->pool->add(tid, retired[tid]->get(ix)); // now we remove the item from retired[tid] retired[tid]->erase(ix); } else { ++ix; // we didn't erase, so we need to move on to the next element } } TRACE std::cout<<" afterwards, we have "<size()<<" things waiting to be retired..."<isFull()); } } void debugPrintStatus(const int tid) { // assert(tid >= 0); // assert(tid < this->NUM_PROCESSES); } reclaimer_hazardptr(const int numProcesses, Pool *_pool, debugInfo * const _debug, RecoveryMgr * const _recoveryMgr = NULL) : scanThreshold(5*numProcesses*MAX_HAZARDPTRS_PER_THREAD), reclaimer_interface(numProcesses, _pool, _debug, _recoveryMgr) { VERBOSE DEBUG std::cout<<"constructor reclaimer_hazardptr"<*[numProcesses]; retired = new ArrayList*[numProcesses]; comparing = new hashset_new*[numProcesses]; for (int tid=0;tid(MAX_HAZARDPTRS_PER_THREAD); retired[tid] = new ArrayList(scanThreshold); comparing[tid] = new hashset_new(numProcesses*MAX_HAZARDPTRS_PER_THREAD); } } ~reclaimer_hazardptr() { VERBOSE DEBUG std::cout<<"destructor reclaimer_hazardptr"<NUM_PROCESSES;++tid) { int sz = retired[tid]->size(); for (int ix=0;ixpool->add(tid, retired[tid]->get(ix)); } delete announce[tid]; delete retired[tid]; delete comparing[tid]; } delete[] announce; delete[] retired; delete[] comparing; } }; // end class #endif ================================================ FILE: datastructures/trevor_brown_abtree/common/recordmgr/reclaimer_interface.h ================================================ /** * C++ record manager implementation (PODC 2015) by Trevor Brown. * * Copyright (C) 2015 Trevor Brown * */ #ifndef RECLAIM_INTERFACE_H #define RECLAIM_INTERFACE_H #include "recovery_manager.h" #include "pool_interface.h" #include "globals.h" #include #include using namespace std; template struct set_of_bags { blockbag * const * const bags; const int numBags; }; template > class reclaimer_interface { public: #ifndef __CYGWIN__ RecoveryMgr * recoveryMgr; #endif debugInfo * const debug; const int NUM_PROCESSES; Pool *pool; template struct rebind { typedef reclaimer_interface<_Tp1, Pool> other; }; template struct rebind2 { typedef reclaimer_interface<_Tp1, _Tp2> other; }; long long getSizeInNodes() { return 0; } string getSizeString() { return ""; } inline static bool quiescenceIsPerRecordType() { return true; } inline static bool shouldHelp() { return true; } // FOR DEBUGGING PURPOSES inline static bool supportsCrashRecovery() { return false; } inline bool isProtected(const int tid, T * const obj); inline bool isQProtected(const int tid, T * const obj); inline static bool isQuiescent(const int tid) { COUTATOMICTID("reclaimer_interface::isQuiescent(tid) is not implemented!"< must be idempotent, * and must unprotect all objects protected by calls to protectObject. * it must NOT unprotect any object protected by a call to * protectObjectEvenAfterRestart. 
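 *
 * caller-side shape of the protect()/retire() interface (illustrative;
 * `recmgr`, readChild() and restartOperation() are hypothetical):
 *
 *   Node *n = readChild(parent);
 *   if (!recmgr->protect(tid, n, callbackReturnTrue, NULL))
 *       restartOperation();       // n was already retired; try again
 *   // ... n is safe to dereference until the next enterQuiescentState ...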
*/ inline void enterQuiescentState(const int tid); inline bool leaveQuiescentState(const int tid, void * const * const reclaimers, const int numReclaimers); inline void rotateEpochBags(const int tid); // for all schemes except reference counting inline void retire(const int tid, T* p); inline void unretireLast(const int tid) {} inline void initThread(const int tid) {} inline void deinitThread(const int tid) {} void debugPrintStatus(const int tid); reclaimer_interface(const int numProcesses, Pool *_pool, debugInfo * const _debug, RecoveryMgr * const _recoveryMgr = NULL) #ifndef __CYGWIN__ : recoveryMgr(_recoveryMgr) #endif , debug(_debug) , NUM_PROCESSES(numProcesses) , pool(_pool) { VERBOSE DEBUG COUTATOMIC("constructor reclaimer_interface"< #include #include "pool_interface.h" #include "reclaimer_interface.h" using namespace std; template > class reclaimer_none : public reclaimer_interface { private: public: template struct rebind { typedef reclaimer_none<_Tp1, Pool> other; }; template struct rebind2 { typedef reclaimer_none<_Tp1, _Tp2> other; }; string getSizeString() { return "no reclaimer"; } inline static bool shouldHelp() { return true; } inline static bool isQuiescent(const int tid) { return true; } inline static bool isProtected(const int tid, T * const obj) { return true; } inline static bool isQProtected(const int tid, T * const obj) { return false; } // for hazard pointers (and reference counting) inline static bool protect(const int tid, T * const obj, CallbackType notRetiredCallback, CallbackArg callbackArg, bool memoryBarrier = true) { return true; } inline static void unprotect(const int tid, T * const obj) {} inline static bool qProtect(const int tid, T * const obj, CallbackType notRetiredCallback, CallbackArg callbackArg, bool memoryBarrier = true) { return true; } inline static void qUnprotectAll(const int tid) {} // rotate the epoch bags and reclaim any objects retired two epochs ago. inline static void rotateEpochBags(const int tid) { } // invoke this at the beginning of each operation that accesses // objects reclaimed by this epoch manager. // returns true if the call rotated the epoch bags for thread tid // (and reclaimed any objects retired two epochs ago). // otherwise, the call returns false. 
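    // caller-side shape of the quiescence bracket (illustrative; doOperation
    // is a hypothetical data structure operation):
    //   recmgr->leaveQuiescentState(tid, reclaimers, numReclaimers);
    //   doOperation(tid);                  // may touch reclaimable records
    //   recmgr->enterQuiescentState(tid);  // all protections may lapse here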
inline static bool leaveQuiescentState(const int tid, void * const * const reclaimers, const int numReclaimers) { return false; } inline static void enterQuiescentState(const int tid) { } // for all schemes except reference counting inline static void retire(const int tid, T* p) { } void debugPrintStatus(const int tid) { } // set_of_bags getBlockbags() { // set_of_bags empty = {.bags = NULL, .numBags = 0}; // return empty; // } // // void getOldestTwoBlockbags(const int tid, blockbag ** oldest, blockbag ** secondOldest) { // *oldest = *secondOldest = NULL; // } // // int getOldestBlockbagIndexOffset(const int tid) { // return -1; // } void getSafeBlockbags(const int tid, blockbag ** bags) { bags[0] = NULL; } reclaimer_none(const int numProcesses, Pool *_pool, debugInfo * const _debug, RecoveryMgr * const _recoveryMgr = NULL) : reclaimer_interface(numProcesses, _pool, _debug, _recoveryMgr) { VERBOSE DEBUG std::cout<<"constructor reclaimer_none"< #include #include #include #include "blockbag.h" #include "plaf.h" #ifdef USE_DEBUGCOUNTERS #include "debugcounter.h" #endif #include "allocator_interface.h" #include "reclaimer_interface.h" #ifdef BST #include "node.h" #include "scxrecord.h" #elif defined KCAS_MAXK #include "kcas.h" #else #error ONLY SUPPORTS BST(main.cpp) and KCAS(ubench.cpp) #endif using namespace std; #include template M get_member_type(M T::*); template T get_class_type(M T::*); template constexpr std::size_t offset_of() { return reinterpret_cast(&(((T*)0)->*M)); } #define OFFSET_OF(m) offset_of() #define comma , __thread long long rcuthrFreesNode = 0; // for RCU THREADS ONLY __thread long long rcuthrFreesDescriptor = 0; // for RCU THREADS ONLY long long freesNode = 0; long long freesDescriptor = 0; #if defined BST || defined BST_THROWAWAY void rcuCallback_Node(struct rcu_head *rcu) { Node * n = (Node *) (((char*) rcu) - OFFSET_OF( &Node::rcuHeadField)); if (++rcuthrFreesNode == 1<<10) { __sync_fetch_and_add(&freesNode, rcuthrFreesNode); rcuthrFreesNode = 0; } free(n); } #ifdef BST_THROWAWAY void rcuCallback_SCXRecord(struct rcu_head *rcu) { SCXRecord * n = (SCXRecord *) (((char*) rcu) - OFFSET_OF( &SCXRecord::rcuHeadField)); if (++rcuthrFreesDescriptor == 1<<10) { __sync_fetch_and_add(&freesDescriptor, rcuthrFreesDescriptor); rcuthrFreesDescriptor = 0; } free(n); } #endif #elif defined KCAS_MAXK void rcuCallback_kcasdesc(struct rcu_head *rcu) { kcasdesc_t * n = (kcasdesc_t *) (((char*) rcu) - OFFSET_OF( &kcasdesc_t::rcuHeadField)); if (++rcuthrFreesDescriptor == 1<<10) { __sync_fetch_and_add(&freesDescriptor, rcuthrFreesDescriptor); rcuthrFreesDescriptor = 0; } free(n); } void rcuCallback_rdcssdesc(struct rcu_head *rcu) { rdcssdesc_t * n = (rdcssdesc_t *) (((char*) rcu) - OFFSET_OF( &rdcssdesc_t::rcuHeadField)); if (++rcuthrFreesNode == 1<<10) { __sync_fetch_and_add(&freesNode, rcuthrFreesNode); rcuthrFreesNode = 0; } free(n); } #endif //template //void rcuCallback(struct rcu_head *rcu) { // T * n = (T *) (((char *) rcu) - OFFSET_OF(&T::rcuHeadField)); // free(n); //} __thread bool calledRCULock = false; __thread bool rcuInitialized = false; template > class reclaimer_rcu : public reclaimer_interface { protected: public: template struct rebind { typedef reclaimer_rcu<_Tp1, Pool> other; }; template struct rebind2 { typedef reclaimer_rcu<_Tp1, _Tp2> other; }; long long getSizeInNodes() { long long sum = 0; return sum; } string getSizeString() { stringstream ss; ss<rcuHeadField, rcuCallback); #if defined BST || defined BST_THROWAWAY if (sizeof(*p) == sizeof(Node)) { 
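        // the callback below recovers the enclosing object from its embedded
        // rcu_head via the OFFSET_OF machinery defined above -- the usual
        // container_of idiom (illustrative):
        //   Node *n = (Node *) (((char *) rcu) - OFFSET_OF(&Node::rcuHeadField));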
call_rcu(&p->rcuHeadField, rcuCallback_Node); #ifdef BST_THROWAWAY } else if (sizeof(*p) == sizeof(SCXRecord)) { call_rcu(&p->rcuHeadField, rcuCallback_SCXRecord); #endif } #elif defined KCAS_MAXK if (sizeof(*p) == sizeof(kcasdesc_t)) { call_rcu(&p->rcuHeadField, rcuCallback_kcasdesc); } else if (sizeof(*p) == sizeof(rdcssdesc_t)) { call_rcu(&p->rcuHeadField, rcuCallback_rdcssdesc); } #endif } void debugPrintStatus(const int tid) { if (freesNode) std::cout<<"freesNode="< #include #include #include using namespace std; inline CallbackReturn callbackReturnTrue(CallbackArg arg) { return true; } // compile time check for duplicate template parameters // compare first with rest to find any duplicates template void check_duplicates(void) {} template void check_duplicates(void) { if (typeid(T) == typeid(First)) { throw logic_error("duplicate template arguments provided to RecordManagerSet"); } check_duplicates(); } // base case: empty template // this is a compile time check for invalid arguments template class RecordManagerSet { public: RecordManagerSet(const int numProcesses, RecoveryMgr * const _recoveryMgr) {} template record_manager_single_type * get(T * const recordType) { throw logic_error("invalid type passed to RecordManagerSet::get()"); return NULL; } void clearCounters(void) {} void registerThread(const int tid) {} void unregisterThread(const int tid) {} void printStatus() {} inline void qUnprotectAll(const int tid) {} inline void getReclaimers(const int tid, void ** const reclaimers, int index) {} inline void enterQuiescentState(const int tid) {} inline void leaveQuiescentStateForEach(const int tid) {} inline void leaveQuiescentState(const int tid, const bool callForEach) {} }; // "recursive" case template class RecordManagerSet : RecordManagerSet { record_manager_single_type * const mgr; public: RecordManagerSet(const int numProcesses, RecoveryMgr * const _recoveryMgr) : RecordManagerSet(numProcesses, _recoveryMgr) , mgr(new record_manager_single_type(numProcesses, _recoveryMgr)) { //cout<<"RecordManagerSet with First="<enterQuiescentState(tid); } else { // only call enterQuiescentState for one object type // std::cout<<"setting quiescent state for just one record type: "<get((RecordTypesFirst *) NULL)->enterQuiescentState(tid); } } inline void leaveQuiescentState(const int tid) { // assert(isQuiescent(tid)); // VERBOSE DEBUG2 COUTATOMIC("record_manager_single_type::leaveQuiescentState(tid="< class record_manager_single_type { protected: typedef Record* record_pointer; typedef typename Alloc::template rebind::other classAlloc; typedef typename Pool::template rebind2::other classPool; typedef typename Reclaim::template rebind2::other classReclaim; public: classAlloc *alloc; classPool *pool; classReclaim *reclaim; const int NUM_PROCESSES; debugInfo debugInfoRecord; RecoveryMgr * const recoveryMgr; record_manager_single_type(const int numProcesses, RecoveryMgr * const _recoveryMgr) : NUM_PROCESSES(numProcesses), debugInfoRecord(debugInfo(numProcesses)), recoveryMgr(_recoveryMgr) { VERBOSE DEBUG COUTATOMIC("constructor record_manager_single_type"<initThread(tid); reclaim->initThread(tid); // enterQuiescentState(tid); } void deinitThread(const int tid) { reclaim->deinitThread(tid); } inline void clearCounters() { debugInfoRecord.clear(); } inline static bool shouldHelp() { // FOR DEBUGGING PURPOSES return Reclaim::shouldHelp(); } inline bool isProtected(const int tid, record_pointer obj) { return reclaim->isProtected(tid, obj); } // for hazard pointers (and reference counting) inline bool 
protect(const int tid, record_pointer obj, CallbackType notRetiredCallback, CallbackArg callbackArg, bool hintMemoryBarrier = true) { return reclaim->protect(tid, obj, notRetiredCallback, callbackArg, hintMemoryBarrier); } inline void unprotect(const int tid, record_pointer obj) { reclaim->unprotect(tid, obj); } // warning: qProtect must be reentrant and lock-free (=== async-signal-safe) inline bool qProtect(const int tid, record_pointer obj, CallbackType notRetiredCallback, CallbackArg callbackArg, bool hintMemoryBarrier = true) { return reclaim->qProtect(tid, obj, notRetiredCallback, callbackArg, hintMemoryBarrier); } inline void qUnprotectAll(const int tid) { assert(!Reclaim::supportsCrashRecovery() || isQuiescent(tid)); reclaim->qUnprotectAll(tid); } inline bool isQProtected(const int tid, record_pointer obj) { return reclaim->isQProtected(tid, obj); } inline static bool supportsCrashRecovery() { return Reclaim::supportsCrashRecovery(); } inline static bool quiescenceIsPerRecordType() { return Reclaim::quiescenceIsPerRecordType(); } inline bool isQuiescent(const int tid) { return reclaim->isQuiescent(tid); } // for epoch based reclamation inline void enterQuiescentState(const int tid) { // VERBOSE DEBUG2 COUTATOMIC("record_manager_single_type::enterQuiescentState(tid="<enterQuiescentState(tid); } inline void leaveQuiescentState(const int tid, void * const * const reclaimers, const int numReclaimers) { // assert(isQuiescent(tid)); reclaim->leaveQuiescentState(tid, reclaimers, numReclaimers); } // for all schemes except reference counting inline void retire(const int tid, record_pointer p) { assert(!Reclaim::supportsCrashRecovery() || isQuiescent(tid)); reclaim->retire(tid, p); } // for algs that retire before the linearization point of a deletion inline void unretireLast(const int tid) { assert(!Reclaim::supportsCrashRecovery() || isQuiescent(tid)); reclaim->unretireLast(tid); } // for all schemes inline record_pointer allocate(const int tid) { assert(!Reclaim::supportsCrashRecovery() || isQuiescent(tid)); return pool->get(tid); } inline void deallocate(const int tid, record_pointer p) { assert(!Reclaim::supportsCrashRecovery() || isQuiescent(tid)); pool->add(tid, p); } void printStatus(void) { long long allocated = debugInfoRecord.getTotalAllocated(); long long allocatedBytes = allocated * sizeof(Record); long long deallocated = debugInfoRecord.getTotalDeallocated(); long long recycled = debugInfoRecord.getTotalFromPool() - allocated; COUTATOMIC("recmgr status for objects of size "<getSizeString()<getSizeString()<getSizeString().c_str()))<debugPrintStatus(tid); } COUTATOMIC(endl); // for (int tid=0;tidNUM_PROCESSES;++tid) { // COUTATOMIC("thread "<debugPrintStatus(tid); // // COUTATOMIC(" "); // //COUTATOMIC("allocated "<debugPrintStatus(tid); // COUTATOMIC(" "); // pool->debugPrintStatus(tid); // COUTATOMIC(" "); // COUTATOMIC("(given="<enterQuiescentState((tid)); \ (finishedbool) = recoverAnyAttemptedSCX((tid), -1); \ recordmgr->recoveryMgr->unblockCrashRecoverySignal(); \ } else #define CHECKPOINT_AND_RUN_QUERY(tid) \ if (MasterRecordMgr::supportsCrashRecovery() && sigsetjmp(setjmpbuffers[(tid)], 0)) { \ recordmgr->enterQuiescentState((tid)); \ recordmgr->recoveryMgr->unblockCrashRecoverySignal(); \ } else #endif // warning: this crash recovery code will only work if you've created a SINGLE instance of bst during an execution. // there are ways to make it work for multiple instances; i just haven't done that. 
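// call-site shape for the checkpoint macros above (illustrative; the
// surrounding retry loop and attemptUpdate() are hypothetical):
//
//   bool finished = false;
//   while (!finished) {
//       CHECKPOINT_AND_RUN_UPDATE(tid, finished) {
//           finished = attemptUpdate(tid);
//       }
//   }
//
// sigsetjmp() records a restart point; if the thread is neutralized, the
// signal handler siglongjmp()s back to it, the recovery branch re-enters a
// quiescent state and unblocks the signal, and the loop retries the update.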
template void crashhandler(int signum, siginfo_t *info, void *uctx) { MasterRecordMgr * const recordmgr = (MasterRecordMgr * const) ___singleton; #ifdef SIGHANDLER_IDENTIFY_USING_PTHREAD_GETSPECIFIC int tid = (int) ((long) pthread_getspecific(pthreadkey)); #endif TRACE COUTATOMICTID("received signal "<isQuiescent(tid)) { #ifdef PERFORM_RESTART_IN_SIGHANDLER recordmgr->enterQuiescentState(tid); #ifdef USE_DEBUGCOUNTERS DEBUG countLongjmp.inc(tid); #endif __sync_synchronize(); #ifdef CRASH_RECOVERY_USING_SETJMP siglongjmp(setjmpbuffers[tid], 1); #endif #endif } // otherwise, i simply continue my operation as if nothing happened. // this lets me behave nicely when it would be dangerous for me to be // restarted (being in a Q state is analogous to having interrupts // disabled in an operating system kernel; however, whereas disabling // interrupts blocks other processes' progress, being in a Q state // implies that you cannot block the progress of any other thread.) } template class RecoveryMgr { public: const int NUM_PROCESSES; const int neutralizeSignal; inline int getTidInefficient(const pthread_t me) { int tid = -1; for (int i=0;i; // specify signal handler ___act.sa_flags = SA_RESTART | SA_SIGINFO; // restart any interrupted sys calls instead of silently failing sigfillset(&___act.sa_mask); // block signals during handler if (sigaction(_neutralizeSignal, &___act, NULL)) { COUTATOMIC("ERROR: could not register signal handler for signal "<<_neutralizeSignal<dtime == TIMESTAMP_NOT_SET) ; true; }) #else #define WAIT_FOR_DTIME(node) ({ false; }) #endif #include #include #include "rq_debugging.h" #include "dcss_plus_impl.h" template inline bool contains(T ** nullTerminatedArray, T * element) { for (int i=0;nullTerminatedArray[i];++i) { if (nullTerminatedArray[i] == element) return true; } return false; } template inline bool contains(T * array, const int numElements, T element) { for (int i=0;i class RQProvider { private: struct __rq_thread_data { #define __RQ_THREAD_DATA_SIZE 1024 #define MAX_NODES_DELETED_ATOMICALLY 8 #define CODE_COVERAGE_MAX_PATHS 11 union { struct { // anonymous struct inside anonymous union means we don't need to type anything special to access these variables long long rq_lin_time; HashList * hashlist; #ifdef COUNT_CODE_PATH_EXECUTIONS long long codePathExecutions[CODE_COVERAGE_MAX_PATHS]; #endif volatile char padding0[PREFETCH_SIZE_BYTES]; void * announcements[MAX_NODES_DELETED_ATOMICALLY]; int numAnnouncements; }; char bytes[__RQ_THREAD_DATA_SIZE]; // avoid false sharing (note: anon struct above contains around 96 bytes) }; } __attribute__((aligned(__RQ_THREAD_DATA_SIZE))); #ifdef COUNT_CODE_PATH_EXECUTIONS #define COUNT_CODE_PATH(path) { assert((path) < CODE_COVERAGE_MAX_PATHS); (++threadData[tid].codePathExecutions[(path)]); } long long codePathExecutions[CODE_COVERAGE_MAX_PATHS]; #else #define COUNT_CODE_PATH(path) #endif #define TIMESTAMP_NOT_SET 0 #define HASHLIST_INIT_CAPACITY_POW2 (1<<8) const int NUM_PROCESSES; volatile char padding0[PREFETCH_SIZE_BYTES]; volatile long long timestamp = 1; volatile char padding1[PREFETCH_SIZE_BYTES]; __rq_thread_data * threadData; #define NODE_DELETED_BEFORE_RQ 0 #define NODE_DELETED_AFTER_RQ 1 #define NODE_NOT_DELETED_BY_THREAD -1 dcsspProvider * prov; DataStructure * ds; RecordManager * const recmgr; int init[MAX_TID_POW2] = {0,}; public: RQProvider(const int numProcesses, DataStructure * ds, RecordManager * recmgr) : NUM_PROCESSES(numProcesses), ds(ds), recmgr(recmgr) { prov = new dcsspProvider(numProcesses); threadData = new 
__rq_thread_data[numProcesses]; DEBUG_INIT_RQPROVIDER(numProcesses); #ifdef COUNT_CODE_PATH_EXECUTIONS for (int i=0;ideinitThread(tid); // threadData[tid].hashlist->destroy(); // delete threadData[tid].hashlist; // } prov->debugPrint(); delete prov; delete[] threadData; DEBUG_DEINIT_RQPROVIDER(NUM_PROCESSES); } // invoke before a given thread can invoke any functions on this object void initThread(const int tid) { if (init[tid]) return; else init[tid] = !init[tid]; prov->initThread(tid); threadData[tid].hashlist = new HashList(); threadData[tid].hashlist->init(HASHLIST_INIT_CAPACITY_POW2); threadData[tid].numAnnouncements = 0; for (int i=0;ideinitThread(tid); threadData[tid].hashlist->destroy(); delete threadData[tid].hashlist; #ifdef COUNT_CODE_PATH_EXECUTIONS for (int i=0;iitime = TIMESTAMP_NOT_SET; node->dtime = TIMESTAMP_NOT_SET; } // for each address addr that is modified by rq_linearize_update_at_write // or rq_linearize_update_at_cas, you must replace any initialization of addr // with invocations of rq_write_addr // // NOTE: this CANNOT be used on fields that might be concurrently being modified // by an invocation of rq_linearize_update_at_write or // rq_linearize_update_at_cas template inline void write_addr(const int tid, T volatile * const addr, const T val) { if (is_pointer::value) { prov->writePtr((casword_t *) addr, (casword_t) val); } else { prov->writeVal((casword_t *) addr, (casword_t) val); } } // for each address addr that is modified by rq_linearize_update_at_write // or rq_linearize_update_at_cas, you must replace any reads of addr with // invocations of rq_read_addr template inline T read_addr(const int tid, T volatile * const addr) { return (T) ((is_pointer::value) ? prov->readPtr(tid, (casword_t *) addr) : prov->readVal(tid, (casword_t *) addr)); } // IF DATA STRUCTURE PERFORMS LOGICAL DELETION // run some time BEFORE the physical deletion of a node // whose key has ALREADY been logically deleted. void announce_physical_deletion(const int tid, NodeType * const * const deletedNodes) { int i; for (i=0;deletedNodes[i];++i) { threadData[tid].announcements[threadData[tid].numAnnouncements+i] = deletedNodes[i]; } SOFTWARE_BARRIER; threadData[tid].numAnnouncements += i; assert(threadData[tid].numAnnouncements <= MAX_NODES_DELETED_ATOMICALLY); SOFTWARE_BARRIER; } // IF DATA STRUCTURE PERFORMS LOGICAL DELETION // run AFTER performing announce_physical_deletion, // if the cas that was trying to physically delete node failed. void physical_deletion_failed(const int tid, NodeType * const * const deletedNodes) { for (int i=0;deletedNodes[i];++i) { --threadData[tid].numAnnouncements; } assert(threadData[tid].numAnnouncements >= 0); } // IF DATA STRUCTURE PERFORMS LOGICAL DELETION // run AFTER performing announce_physical_deletion, // if the cas that was trying to physically delete node succeeded. void physical_deletion_succeeded(const int tid, NodeType * const * const deletedNodes) { int i; for (i=0;deletedNodes[i];++i) { recmgr->retire(tid, deletedNodes[i]); } SOFTWARE_BARRIER; // ensure nodes are placed in the epoch bag BEFORE they are removed from announcements. 
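        // call-sequence recap for this announcement protocol (illustrative;
        // casSucceeded stands for the data structure's own deletion CAS):
        //   announce_physical_deletion(tid, deletedNodes);
        //   if (casSucceeded) physical_deletion_succeeded(tid, deletedNodes);
        //   else              physical_deletion_failed(tid, deletedNodes);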
        threadData[tid].numAnnouncements -= i;
        assert(threadData[tid].numAnnouncements >= 0);
    }

private:
    inline void set_insertion_timestamps(
            const int tid,
            const long long ts,
            NodeType * const * const insertedNodes,
            NodeType * const * const deletedNodes) {
        // set insertion timestamps
        // for each i_node in insertedNodes
        for (int i_nodeix=0;insertedNodes[i_nodeix];++i_nodeix) {
            insertedNodes[i_nodeix]->itime = ts;
        }
    }

    inline void set_deletion_timestamps(
            const int tid,
            const long long ts,
            NodeType * const * const insertedNodes,
            NodeType * const * const deletedNodes) {
        // set deletion timestamps
        // for each d_node in deletedNodes
        for (int d_nodeix=0;deletedNodes[d_nodeix];++d_nodeix) {
            deletedNodes[d_nodeix]->dtime = ts;
        }
    }

public:
    // replace the linearization point of an update that inserts or deletes nodes
    // with an invocation of this function if the linearization point is a WRITE
    template <typename T>
    inline T linearize_update_at_write(
            const int tid,
            T volatile * const lin_addr,
            const T& lin_newval,
            NodeType * const * const insertedNodes,
            NodeType * const * const deletedNodes) {
        if (!logicalDeletion) {
            // physical deletion will happen at the same time as logical deletion
            announce_physical_deletion(tid, deletedNodes);
        }
        casword_t old1;
        while (true) {
            old1 = (casword_t) timestamp;
            casword_t old2 = (is_pointer<T>::value)
                    ? (casword_t) prov->readPtr(tid, (casword_t *) lin_addr)
                    : (casword_t) prov->readVal(tid, (casword_t *) lin_addr);
            casword_t new2 = (casword_t) lin_newval;
            dcsspresult_t result = (is_pointer<T>::value)
                    ? prov->dcsspPtr(tid, (casword_t *) &timestamp, old1, (casword_t *) lin_addr, old2, new2, (void **) insertedNodes, (void **) deletedNodes)
                    : prov->dcsspVal(tid, (casword_t *) &timestamp, old1, (casword_t *) lin_addr, old2, new2, (void **) insertedNodes, (void **) deletedNodes);
            if (result.status == DCSSP_SUCCESS) {
                break;
            }
        }
        //DELAY_UP_TO(10000);
        set_insertion_timestamps(tid, old1 /* timestamp */, insertedNodes, deletedNodes);
        set_deletion_timestamps(tid, old1 /* timestamp */, insertedNodes, deletedNodes);
        // discard the payloads (insertedNodes and deletedNodes) in this thread's descriptor
        // so other threads can't access them far in the future if we become QUIESCENT and sleep for a long time
        // (must be performed after setting itimes and dtimes, but before enterQuiescentState)
        prov->discardPayloads(tid);
        if (!logicalDeletion) {
            // physical deletion will happen at the same time as logical deletion
            physical_deletion_succeeded(tid, deletedNodes);
        }
#if defined USE_RQ_DEBUGGING
        DEBUG_RECORD_UPDATE_CHECKSUM(tid, old1 /* timestamp */, insertedNodes, deletedNodes, ds);
#endif
        return lin_newval;
    }

    // replace the linearization point of an update that inserts or deletes nodes
    // with an invocation of this function if the linearization point is a CAS
    template <typename T>
    inline T linearize_update_at_cas(
            const int tid,
            T volatile * const lin_addr,
            const T& lin_oldval,
            const T& lin_newval,
            NodeType * const * const insertedNodes,
            NodeType * const * const deletedNodes) {
        if (!logicalDeletion) {
            // physical deletion will happen at the same time as logical deletion
            announce_physical_deletion(tid, deletedNodes);
        }
        casword_t old2 = (casword_t) lin_oldval;
        casword_t new2 = (casword_t) lin_newval;
        dcsspresult_t result;
        while (true) {
            casword_t old1 = (casword_t) timestamp;
            result = (is_pointer<T>::value)
                    ? prov->dcsspPtr(tid, (casword_t *) &timestamp, old1 /* timestamp */, (casword_t *) lin_addr, old2, new2, (void **) insertedNodes, (void **) deletedNodes)
                    : prov->dcsspVal(tid, (casword_t *) &timestamp, old1 /* timestamp */, (casword_t *) lin_addr, old2, new2, (void **) insertedNodes, (void **) deletedNodes);
            if (result.status == DCSSP_SUCCESS) {
                //DELAY_UP_TO(1000);
                set_insertion_timestamps(tid, old1 /* timestamp */, insertedNodes, deletedNodes);
                set_deletion_timestamps(tid, old1 /* timestamp */, insertedNodes, deletedNodes);
                // discard the payloads (insertedNodes and deletedNodes) in this thread's descriptor
                // so other threads can't access them far in the future if we become QUIESCENT and sleep for a long time
                // (must be performed after setting itimes and dtimes, but before enterQuiescentState)
                prov->discardPayloads(tid);
                if (!logicalDeletion) {
                    // physical deletion will happen at the same time as logical deletion
                    physical_deletion_succeeded(tid, deletedNodes);
                }
#if defined USE_RQ_DEBUGGING
                DEBUG_RECORD_UPDATE_CHECKSUM(tid, old1 /* timestamp */, insertedNodes, deletedNodes, ds);
#endif
                return lin_oldval;
            } else if (result.status == DCSSP_FAILED_ADDR2) {
                // failed due to original CAS's failure (NOT due to the timestamp changing)
                if (!logicalDeletion) {
                    // physical deletion will happen at the same time as logical deletion
                    physical_deletion_failed(tid, deletedNodes);
                }
                break;
            }
        }
        assert(result.status == DCSSP_FAILED_ADDR2);
        assert(old2 != result.failed_val);
        return (T) result.failed_val;
    }

    // invoke at the start of each traversal
    inline void traversal_start(const int tid) {
        threadData[tid].hashlist->clear();
        threadData[tid].rq_lin_time = __sync_add_and_fetch(&timestamp, 1); // linearize rq here!
    }

private:
    // invoke each time a traversal visits a node with a key in the desired range:
    // if the node belongs in the range query, it will be placed in rqResult[index]
    inline int __traversal_try_add(const int tid, NodeType * const node, K * const outputKeys, V * const outputValues, const K& lo, const K& hi, bool foundDuringTraversal) {
        // rqResultKeys should have space for MAX_KEYS_PER_NODE keys, AT LEAST
        // in the following, rather than having deeply nested if-else blocks,
        // we return asap, and list facts that must be true if we didn't return
        assert(foundDuringTraversal || !logicalDeletion || ds->isLogicallyDeleted(tid, node)); // TODO: ensure this makes sense when called with announced nodes
        long long itime = node->itime;
        if (itime != TIMESTAMP_NOT_SET && node->itime >= threadData[tid].rq_lin_time) return 0; // node was inserted after the range query
        // fact: either itime was not set above, or node was inserted before rq

        ///////////////////////// HANDLE UNKNOWN ITIME /////////////////////////
        // TODO: try adding a bit of spinning before falling back to the full lock-free solution
        // determine if any other process inserted, or is trying to insert node, and, if so, when
        for (int otherTid=0; (itime = node->itime) == TIMESTAMP_NOT_SET && otherTid<NUM_PROCESSES; ++otherTid) {
            tagptr_t tagptr = prov->getDescriptorTagptr(otherTid);
            // try to get a snapshot of otherTid's dcssp descriptor
            dcsspdesc_t snap;
            if (!prov->getDescriptorSnapshot(tagptr, &snap)) {
                // we failed to obtain a snapshot, which means that while getDescriptorSnapshot()
                // was running, the process finished one dcssp, and started a new dcssp.
                continue; // goto check next process
                // if the finished dcssp inserted node, then before the next dcssp by the
                // same process, node->itime is set. so, we check whether itime is set.
            }
            // fact: we obtained a snapshot
            if (!SNAPSHOT_CONTAINS_INSERTED_NODE(snap, node)) continue; // goto check next process
            // fact: otherTid is trying/tried to insert node
            int state = MUTABLES_UNPACK_FIELD(snap.mutables, DCSSP_MUTABLES_MASK_STATE, DCSSP_MUTABLES_OFFSET_STATE);
            if (state == DCSSP_STATE_FAILED) {
                // the operation described by snap did not insert node, so either this process
                // did not insert it, or the process inserted it in a PREVIOUS operation,
                // so it must have already set itime as appropriate
                continue; // goto check next process
            } else if (state == DCSSP_STATE_SUCCEEDED) {
                // the dcssp operation finished, and inserted node. to determine WHEN it inserted
                // node, we look at the argument old1 to the dcssp, which contains the timestamp
                // when the dcssp took place. (observe that this is the value process otherTid
                // would write to node->itime)
                if (snap.old1 >= threadData[tid].rq_lin_time) return 0; // node was inserted after rq
                break; // process inserted node at time snap.old1, BEFORE the RQ
            }
            // fact: state is UNDECIDED
            // now we try to help
            casword_t addr2 = *snap.addr2;
            if (addr2 == tagptr) {
                // addr2 indeed points to the dcssp descriptor. the linearization point of the
                // dcssp operation occurs after this step, so the dcssp might have been
                // linearized, but not yet had its state set.
                prov->helpProcess(tid, otherTid); // we need to know what its final state will be to determine whether it successfully inserted node. so, we HELP otherTid finish its dcssp.
            }
            // note: the following all happens in BOTH the cases where addr2 == tagptr and where
            // addr2 != tagptr, except there is some extra work if addr2 != tagptr and
            // state2 != SUCCEEDED. i've folded the two cases together simply for compactness /
            // less repetition.
            // then, we reread the state
            bool valid = false;
            dcsspdesc_t * ptr = prov->getDescriptorPtr(tagptr);
            int state2 = DESC_READ_FIELD(valid, ptr->mutables, tagptr, DCSSP_MUTABLES_MASK_STATE, DCSSP_MUTABLES_OFFSET_STATE);
            if (!valid) continue; // goto check next process
            // the read of the state field was invalid, which means that the dcssp operation has
            // terminated, and the next dcssp operation by otherTid has begun. since the next
            // dcssp operation has begun, and each high-level data structure operation performs
            // only one successful dcssp (in a call to linearize_update_at_...), if this dcssp
            // that finished in fact inserted node, then the next dcssp would be part of the next
            // high-level operation. thus, if node was inserted by the finished dcssp, then the
            // high-level operation that performed this dcssp will already have set node->itime.
            // fact: the read of state was valid
            if (state2 == DCSSP_STATE_SUCCEEDED) {
                // we are in case (b) described above. the dcssp operation finished, and inserted
                // node. to determine WHEN it inserted node, we look at the argument old1 to the
                // dcssp, which contains the timestamp when the dcssp took place. (observe that
                // this is the value process otherTid would write to node->itime)
                if (snap.old1 >= threadData[tid].rq_lin_time) return 0; // node was inserted after rq
                break; // process inserted node at time snap.old1, BEFORE the RQ
            } else { // undecided or failed
                continue; // goto check next process
                // we are in case (a) or case (c) described above. so, otherTid did NOT insert node.
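                // Illustrative numbers for the two exits above (assuming this thread's
                // rq_lin_time is 50): if the dcssp ran with snap.old1 == 53, the
                // insertion happened after the query's clock bump, so the node is
                // excluded (return 0); if snap.old1 == 47, the node was already
                // inserted when the query linearized, so we stop searching (break)
                // and go on to check dtime.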
            }
        }
        if (itime != TIMESTAMP_NOT_SET && itime >= threadData[tid].rq_lin_time) return 0; // node was inserted after rq

        /////////////// HANDLE LOGICAL DELETION AND CHECK DTIME ////////////////

        long long dtime = TIMESTAMP_NOT_SET;
        if (!logicalDeletion && foundDuringTraversal) goto tryAddToRQ; // no logical deletion. since node was inserted before the range query, and the traversal encountered it, it must have been deleted AFTER the traversal encountered it.
        // fact: no logical deletion ==> did not find node during traversal
        dtime = node->dtime;
        if (dtime != TIMESTAMP_NOT_SET && dtime < threadData[tid].rq_lin_time) return 0;
        // fact: either dtime was not set above, or node was deleted after rq
        if (logicalDeletion && !ds->isLogicallyDeleted(tid, node)) goto tryAddToRQ; // if logical deletion is used with marking, the fact that node was inserted before the range query, and that the traversal encountered node, is NOT enough to argue that node was in the data structure when the traversal started. why? when the traversal encountered node, it might have already been marked. so, we check if node is marked. if not, then the node has not yet been deleted.
        // fact: logical deletion ==> node has been logically deleted

        ///////////////////////// HANDLE UNKNOWN DTIME /////////////////////////

        // determine if any other process is trying/tried to delete node
        for (int otherTid=0; (dtime = node->dtime) == TIMESTAMP_NOT_SET && otherTid<NUM_PROCESSES; ++otherTid) {
            tagptr_t tagptr = prov->getDescriptorTagptr(otherTid);
            // try to get a snapshot of otherTid's dcssp descriptor
            dcsspdesc_t snap;
            if (!prov->getDescriptorSnapshot(tagptr, &snap)) {
                // we failed to obtain a snapshot, which means that while getDescriptorSnapshot()
                // was running, the process finished one dcssp, and started a new dcssp.
                continue; // goto check next process
                // if the finished dcssp deleted node, then before the next dcssp by the same
                // process, node->dtime is set (and it will be seen after the loop).
            }
            // fact: we obtained a snapshot
            if (!SNAPSHOT_CONTAINS_DELETED_NODE(snap, node)) continue; // goto check next process
            // fact: otherTid is trying/tried to delete node
            // we must determine whether otherTid's dcssp operation (whose descriptor we obtained
            // a snapshot of) has been linearized, and whether it was successful.
            // we use the following facts.
            // (1) the dcssp descriptor has a state that is initially UNDECIDED, and becomes
            //     SUCCEEDED or FAILED after the dcssp has been linearized.
            // (2) a dcssp that succeeds or fails changes *snap.addr2 to tagptr, then reads
            //     *snap.addr1 and linearizes, then sets its state to SUCCEEDED or FAILED, then
            //     changes *snap.addr2 from tagptr to another value.
            // (3) once *snap.addr2 has been changed from tagptr to another value, it can never
            //     again contain tagptr.
            // (4) each high-level data structure operation invokes dcssp only via
            //     linearize_update_at_write or linearize_update_at_cas, and only performs one
            //     invocation of linearize_update_at_write or one /successful/ invocation of
            //     linearize_update_at_cas (and possibly many unsuccessful invocations of
            //     linearize_update_at_cas).
            // (5) if a dcssp operation deletes node, then before the next dcssp by the same
            //     process, node->dtime is set.
            // so, we check the state of the dcssp operation. if it is SUCCEEDED or FAILED, we
            // have our answer. but, if the state is UNDECIDED, the dcssp may or may not have
            // been linearized.
            // in the latter case, to determine whether it has been linearized, we would like to
            // HELP the dcssp operation to complete.
// note, however, that the help procedure for the dcssp algorithm can be invoked only if otherTid has already changed *snap.addr2 to tagptr. // thus, we must determine whether otherTid has changed *snap.addr2 to tagptr, before we can help the dcssp operation. // so, we read *snap.addr2. if we see that it contains tagptr, then we can help the dcssp. // otherwise, one of the following must be true: // (a) otherTid has not yet changed *snap.addr2 to tagptr, or // (b) otherTid changed *snap.addr2 to tagptr, then it (or a helper) changed its state to SUCCEEDED, then changed *snap.addr2 to a different value (never again to contain tagptr), or // (c) otherTid changed *snap.addr2 to tagptr, then it (or a helper) changed its state to FAILED, then changed *snap.addr2 to a different value (never again to contain tagptr). // in case (a), we know that the dcssp has not yet been linearized. // in case (b), the dcssp has been linearized, has state SUCCEEDED, and deleted node. // in case (c), the dcssp has been linearized, has state FAILED, and did NOT delete node. // so, after reading *snap.addr2, we read the dcssp operation's state again to determine which case has occurred. int state = MUTABLES_UNPACK_FIELD(snap.mutables, DCSSP_MUTABLES_MASK_STATE, DCSSP_MUTABLES_OFFSET_STATE); if (state == DCSSP_STATE_FAILED) { // the operation described by snap did not insert/delete node, so either this process did not insert/delete it, or the process inserted/deleted it in a PREVIOUS operation, so it must have already set itime/dtime as appropriate continue; // goto check next process } else if (state == DCSSP_STATE_SUCCEEDED) { // the dcssp operation finished, and deleted node. to determine WHEN it deleted node, we look at the argument old1 to the dcssp, which contains the timestamp when the dcssp took place. (observe that this is the value process otherTid would write to node->dtime) if (WAIT_FOR_DTIME(node)) { // the following assertions are thread safe ONLY if WAIT_FOR_DTIME actually waits! (which is true only if it returns true, which is true only if RQ_LOCKFREE_WAITS_FOR_DTIME is defined) assert(snap.old1 <= node->dtime); assert((snap.old1 >= threadData[tid].rq_lin_time) == (node->dtime >= threadData[tid].rq_lin_time)); assert(foundDuringTraversal || node->dtime == snap.old1); assert(!foundDuringTraversal || node->dtime == snap.old1); } if (snap.old1 < threadData[tid].rq_lin_time) return 0; // node was deleted before rq goto tryAddToRQ; // node was deleted by this process after rq } // fact: state is UNDECIDED // maya: in logicalDeletion, since the node is marked the DCSS was successful and only the dtime is not yet set. Thus, UNDECIDED means that otherThread did not mark the node, it was some other thread and we can continue. if (logicalDeletion) continue; // goto check next process // now we try to help casword_t addr2 = *snap.addr2; if (addr2 == tagptr) { // TODO: prove it is impossible to execute this block with logical deletion (idea: since node is marked, either (1) otherTid marked it earlier with a DCSS whose state is SUCCEEDED, or (2) someone else marked node, so otherTid cannot successfully CAS addr2.) // addr2 indeed points to the dcssp descriptor. the linearization point of the dcssp operation occurs after this step, so the dcssp might have been linearized, but not yet had its state set. prov->helpProcess(tid, otherTid); // we need to know what its final state will be to determine whether it successfully deleted node. so, we HELP otherTid finish its dcssp. 
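            // The addr2-then-state reread that follows can be summarized as a small
            // decision table (illustrative; cases (a)-(c) are the ones defined above):
            //
            //     *snap.addr2     reread of state        conclusion
            //     == tagptr       (help, then reread)    state is now final
            //     != tagptr       SUCCEEDED              case (b): deleted at snap.old1
            //     != tagptr       UNDECIDED or FAILED    case (a)/(c): otherTid did not
            //                                            delete node; check next process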
            }
            // note: the following all happens in BOTH the cases where addr2 == tagptr and where
            // addr2 != tagptr, except there is some extra work if addr2 != tagptr and
            // state2 != SUCCEEDED. i've folded the two cases together simply for compactness /
            // less repetition.
            // then, we reread the state
            bool valid = false;
            dcsspdesc_t * ptr = prov->getDescriptorPtr(tagptr);
            int state2 = DESC_READ_FIELD(valid, ptr->mutables, tagptr, DCSSP_MUTABLES_MASK_STATE, DCSSP_MUTABLES_OFFSET_STATE);
            if (!valid) continue; // goto check next process
            // the read of the state field was invalid, which means that the dcssp operation has
            // terminated, and the next dcssp operation by otherTid has begun. since the next
            // dcssp operation has begun, and each high-level data structure operation performs
            // only one successful dcssp (in a call to linearize_update_at_...), if this dcssp
            // that finished in fact deleted node, then the next dcssp would be part of the next
            // high-level operation. thus, if node was deleted by the finished dcssp, then the
            // high-level operation that performed this dcssp will already have set node->dtime.
            // fact: the read of state was valid
            if (state2 == DCSSP_STATE_SUCCEEDED) {
                // we are in case (b) described above. the dcssp operation finished, and deleted
                // node. to determine WHEN it deleted node, we look at the argument old1 to the
                // dcssp, which contains the timestamp when the dcssp took place. (observe that
                // this is the value process otherTid would write to node->dtime)
                if (snap.old1 >= threadData[tid].rq_lin_time) goto tryAddToRQ; // node was deleted by this process after rq
                return 0; // do not add to rq
            } else { // undecided or failed
                continue; // goto check next process
                // we are in case (a) or case (c) described above. so, otherTid did NOT delete node.
            }
        }
        if (dtime == TIMESTAMP_NOT_SET) {
            assert(!logicalDeletion);
            assert(!foundDuringTraversal);
            goto tryAddToRQ; // no process deleted node before the range query
        }
        COUNT_CODE_PATH(9);
        if (dtime >= threadData[tid].rq_lin_time) goto tryAddToRQ;
        return 0; // do not add to rq

        ///////////////////// TRY TO ADD NODE'S KEYS TO RQ /////////////////////
        // note: this way of organizing this decision tree favors trees with fat multi-key nodes, because getKeys is delayed as long as possible.

tryAddToRQ:
        // fetch the node's keys that are in the set
        int cnt = ds->getKeys(tid, node, outputKeys, outputValues);
        assert(cnt < RQ_DEBUGGING_MAX_KEYS_PER_NODE);
        if (cnt == 0) return 0; // node doesn't contain any keys that are in the set and in the desired range
        // note: in the following loop, we shift keys in the outputKeys array left to eliminate any that ultimately should not be added to the range query
        int numNewKeys = 0;
        for (int i=0;i<cnt;++i) {
            if (!ds->isInRange(outputKeys[i], lo, hi)) goto doNotAddToRQ; // key is NOT in the desired range
            if (threadData[tid].hashlist->contains(outputKeys[i])) goto doNotAddToRQ; // key is already in the range query
            outputKeys[numNewKeys] = outputKeys[i]; // save this as a new key added to the RQ
            outputValues[numNewKeys] = outputValues[i];
            ++numNewKeys;
doNotAddToRQ: (0);
        }
        return numNewKeys;
    }

    inline void traversal_try_add(const int tid, NodeType * const node, K * const rqResultKeys, V * const rqResultValues, int * const startIndex, const K& lo, const K& hi, bool foundDuringTraversal) {
        //#if defined MICROBENCH && !defined NDEBUG
        //    assert(*startIndex < RQSIZE); // note: this assert is a hack.
it should be *startIndex < size of rqResultKeys // if (*startIndex > RQSIZE) { // cout<<"ERROR: *startIndex="<<(*startIndex)<<" is unexpectedly greater than or equal to RQSIZE="<insert(rqResultKeys[(*startIndex)++]); } // note: the above increments startIndex #if defined MICROBENCH assert(*startIndex <= RQSIZE); #endif } public: inline void traversal_try_add(const int tid, NodeType * const node, K * const rqResultKeys, V * const rqResultValues, int * const startIndex, const K& lo, const K& hi) { traversal_try_add(tid, node, rqResultKeys, rqResultValues, startIndex, lo, hi, true); } // invoke at the end of each traversal: // any nodes that were deleted during the traversal, // and were consequently missed during the traversal, // are placed in rqResult[index] void traversal_end(const int tid, K * const rqResultKeys, V * const rqResultValues, int * const startIndex, const K& lo, const K& hi) { // todo: possibly optimize by skipping entire blocks if there are many keys to skip (does not seem to be justifiable for 4 work threads and 4 range query threads) SOFTWARE_BARRIER; long long end_timestamp = timestamp; SOFTWARE_BARRIER; #if 0 vector nodes; // collect nodes announced by other processes for (int otherTid=0;otherTid * all_bags[NUM_PROCESSES*NUMBER_OF_EPOCH_BAGS+1]; vector> all_iterators; int numIterators = 0; for (int otherTid=0;otherTid * thread_bags[NUMBER_OF_EPOCH_BAGS+1]; recmgr->get((NodeType *) NULL)->reclaim->getSafeBlockbags(otherTid, thread_bags); for (int i=0;thread_bags[i];++i) { all_bags[numIterators] = thread_bags[i]; all_iterators.push_back(thread_bags[i]->begin()); ++numIterators; } } // collect nodes in epoch bags int numVisitedInEpochBags = 0; for (int ix = 0; ix < numIterators; ++ix) { for (; all_iterators[ix] != all_bags[ix]->end(); all_iterators[ix]++) { NodeType * node = (*all_iterators[ix]); nodes.push_back(node); ++numVisitedInEpochBags; long long dtime = node->dtime; if (dtime != TIMESTAMP_NOT_SET && dtime > end_timestamp) continue; if (!(logicalDeletion && canRetireNodesLogicallyDeletedByOtherProcesses)) { // if we cannot retire nodes that are logically deleted // by other processes, then we always retire nodes in // order of increasing dtime values. // so, the blockbag will be ordered, which means that, // if dtime is before the RQ, then all remaining nodes // in this bag were deleted before the RQ. // so, in this case, we skip to the next bag. 
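            // A concrete instance of this skip rule (illustrative numbers only):
            // suppose rq_lin_time == 100 and end_timestamp == 120, and a bag is
            // visited newest-first with dtimes 130, 110, 95, 80. then 130 is skipped
            // (deleted after the traversal ended, so the traversal itself saw the
            // node), 110 is a candidate (deleted during the query window, so it may
            // have been missed), and 95 ends the bag: it and everything after it were
            // deleted before the query linearized. the continue/break checks below
            // implement exactly this behavior.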
if (dtime != TIMESTAMP_NOT_SET && dtime < threadData[tid].rq_lin_time) break; } } } // visit collected nodes for (auto it = nodes.begin(); it != nodes.end(); it++) { NodeType * node = *it; traversal_try_add(tid, node, rqResultKeys, startIndex, lo, hi, false); } #else // collect nodes announced by other processes for (int otherTid=0;otherTid * all_bags[NUM_PROCESSES*NUMBER_OF_EPOCH_BAGS+1]; vector> all_iterators; int numIterators = 0; for (int otherTid=0;otherTid * thread_bags[NUMBER_OF_EPOCH_BAGS+1]; recmgr->get((NodeType *) NULL)->reclaim->getSafeBlockbags(otherTid, thread_bags); for (int i=0;thread_bags[i];++i) { all_bags[numIterators] = thread_bags[i]; all_iterators.push_back(thread_bags[i]->begin()); ++numIterators; } } int numSkippedInEpochBags = 0; int numVisitedInEpochBags = 0; for (int ix = 0; ix < numIterators; ++ix) { for (; all_iterators[ix] != all_bags[ix]->end(); all_iterators[ix]++) { NodeType * node = (*all_iterators[ix]); assert(node); ++numVisitedInEpochBags; ++numSkippedInEpochBags; long long dtime = node->dtime; if (dtime != TIMESTAMP_NOT_SET && dtime > end_timestamp) continue; --numSkippedInEpochBags; if (!(logicalDeletion && canRetireNodesLogicallyDeletedByOtherProcesses)) { // if we cannot retire nodes that are logically deleted // by other processes, then we always retire nodes in // order of increasing dtime values. // so, the blockbag will be ordered, which means that, // if dtime is before the RQ, then all remaining nodes // in this bag were deleted before the RQ. // so, in this case, we skip to the next bag. if (dtime != TIMESTAMP_NOT_SET && dtime < threadData[tid].rq_lin_time) break; } traversal_try_add(tid, node, rqResultKeys, rqResultValues, startIndex, lo, hi, false); } } #endif #if defined MICROBENCH && !defined NDEBUG if (*startIndex > RQSIZE) { cout<<"ERROR: *startIndex="<<(*startIndex)<<" is unexpectedly greater than or equal to RQSIZE="< #include #include using namespace std; #define MAX_NUM_RQ_IN_EXECUTION (1<<20) #ifdef RQ_VISITED_IN_BAGS_HISTOGRAM #include string twoDigits(int x) { stringstream ss; if (x >= 0 && x < 10) { ss<<"0"; } ss< void printLogarithmicHistogram(T * valuesOverTime, int numValues) { constexpr int numBits = sizeof(T)*8; int histogram[numBits+1]; memset(histogram, 0, sizeof(histogram)); T sum = 0; int cntNonZero = 0; for (int i=0;i 1) { v >>= 1; ++pow2; } assert(pow2 <= numBits); ++histogram[pow2]; } for (int i=0;i<=numBits;++i) { if (histogram[i] > 0) { cout<<" (2^"< #define CSV_OUTPUT_FILE "data.csv" std::ofstream ofs; #define MAX_RQ_SIZE (1<<16) __thread int numRQs[MAX_RQ_SIZE+1]; int totalNumRQs[MAX_RQ_SIZE+1]; #endif inline void DEBUG_RECORD_RQ_VISITED(const int tid, const long long ts, const int numVisited) { #ifdef RQ_VISITED_IN_BAGS_HISTOGRAM if (ts >= MAX_NUM_RQ_IN_EXECUTION) return; threadNumNodesVisitedInBags[tid][ts] = numVisited; #endif } inline void DEBUG_RECORD_RQ_SIZE(const int size) { #ifdef RQ_HISTOGRAM ++numRQs[size]; #endif } template inline void DEBUG_RECORD_UPDATE_CHECKSUM(const int tid, const long long timestamp, Node * const * const insertedNodes, Node * const * const deletedNodes, DataStructure * const ds) { #ifdef RQ_VALIDATION if (timestamp >= MAX_NUM_RQ_IN_EXECUTION) { return; // cout << "timestamp is: " << timestamp << endl; // error("timestamp > MAX_NUM_RQ_IN_EXECUTION"); } for (int i=0;insertedNodes[i];++i) { K outputKeys[RQ_DEBUGGING_MAX_KEYS_PER_NODE]; V outputValues[RQ_DEBUGGING_MAX_KEYS_PER_NODE]; int cnt = ds->getKeys(tid, insertedNodes[i], outputKeys, outputValues); assert(cnt <= 
RQ_DEBUGGING_MAX_KEYS_PER_NODE); for (int j=0;jgetKeys(tid, deletedNodes[i], outputKeys, outputValues); assert(cnt <= RQ_DEBUGGING_MAX_KEYS_PER_NODE); for (int j=0;j inline void DEBUG_RECORD_RQ_CHECKSUM(const int tid, const long long timestamp, K const * const rqResult, const int len) { #ifdef RQ_VALIDATION if (timestamp >= MAX_NUM_RQ_IN_EXECUTION) return; //if (timestamp >= MAX_NUM_RQ_IN_EXECUTION) error("timestamp > MAX_NUM_RQ_IN_EXECUTION"); // compute checksum long long checksum = 0; for (int i=0;i 0) { cout<<"RQ VALIDATION TOTAL FAILURES: "< #include #include #include #include #define MAX_HTM_ATTEMPTS 30 #define dosum(src) ({ \ long long __sum = 0; \ for (int __i=0;__i class RQProvider { private: struct __rq_thread_data { #define __RQ_THREAD_DATA_SIZE 1024 union { struct { // anonymous struct inside anonymous union means we don't need to type anything special to access these variables long long rq_lin_time; HashList * hashlist; volatile char padding0[PREFETCH_SIZE_BYTES]; void * announcements[MAX_NODES_DELETED_ATOMICALLY+1]; int numAnnouncements; volatile char padding1[PREFETCH_SIZE_BYTES]; // prevent false sharing between htm debugging stats below and announcements // htm debugging stuff int commitWriter; int abortWriter; int commitReader; int abortReader; int fallback; }; char bytes[__RQ_THREAD_DATA_SIZE]; // avoid false sharing }; } __attribute__((aligned(__RQ_THREAD_DATA_SIZE))); #define TIMESTAMP_NOT_SET 0 #define HASHLIST_INIT_CAPACITY_POW2 (1<<8) const int NUM_PROCESSES; volatile char padding0[PREFETCH_SIZE_BYTES]; volatile long long timestamp = 1; volatile char padding1[PREFETCH_SIZE_BYTES]; RWLock rwlock; volatile char padding2[PREFETCH_SIZE_BYTES]; __rq_thread_data * threadData; DataStructure * ds; RecordManager * const recmgr; int init[MAX_TID_POW2] = {0,}; public: RQProvider(const int numProcesses, DataStructure * ds, RecordManager * recmgr) : NUM_PROCESSES(numProcesses), ds(ds), recmgr(recmgr) { threadData = new __rq_thread_data[numProcesses]; DEBUG_INIT_RQPROVIDER(numProcesses); } ~RQProvider() { cout<<"writer commits : "<destroy(); // delete threadData[tid].hashlist; // } delete[] threadData; DEBUG_DEINIT_RQPROVIDER(NUM_PROCESSES); } // invoke before a given thread can perform any rq_functions void initThread(const int tid) { if (init[tid]) return; else init[tid] = !init[tid]; threadData[tid].hashlist = new HashList(); threadData[tid].hashlist->init(HASHLIST_INIT_CAPACITY_POW2); threadData[tid].numAnnouncements = 0; for (int i=0;idestroy(); delete threadData[tid].hashlist; DEBUG_DEINIT_THREAD(tid); } // invoke whenever a new node is created/initialized inline void init_node(const int tid, NodeType * const node) { node->itime = TIMESTAMP_NOT_SET; node->dtime = TIMESTAMP_NOT_SET; } // for each address addr that is modified by rq_linearize_update_at_write // or rq_linearize_update_at_cas, you must replace any initialization of addr // with invocations of rq_write_addr template inline void write_addr(const int tid, T volatile * const addr, const T val) { *addr = val; } // for each address addr that is modified by rq_linearize_update_at_write // or rq_linearize_update_at_cas, you must replace any reads of addr with // invocations of rq_read_addr template inline T read_addr(const int tid, T volatile * const addr) { return *addr; } // IF DATA STRUCTURE PERFORMS LOGICAL DELETION // run some time BEFORE the physical deletion of a node // whose key has ALREADY been logically deleted. 
void announce_physical_deletion(const int tid, NodeType * const * const deletedNodes) { int i; for (i=0;deletedNodes[i];++i) { threadData[tid].announcements[threadData[tid].numAnnouncements+i] = deletedNodes[i]; } SOFTWARE_BARRIER; threadData[tid].numAnnouncements += i; assert(threadData[tid].numAnnouncements <= MAX_NODES_DELETED_ATOMICALLY); SOFTWARE_BARRIER; } // IF DATA STRUCTURE PERFORMS LOGICAL DELETION // run AFTER performing announce_physical_deletion, // if the cas that was trying to physically delete node failed. void physical_deletion_failed(const int tid, NodeType * const * const deletedNodes) { for (int i=0;deletedNodes[i];++i) { --threadData[tid].numAnnouncements; } assert(threadData[tid].numAnnouncements >= 0); } // IF DATA STRUCTURE PERFORMS LOGICAL DELETION // run AFTER performing announce_physical_deletion, // if the cas that was trying to physically delete node succeeded. void physical_deletion_succeeded(const int tid, NodeType * const * const deletedNodes) { int i; for (i=0;deletedNodes[i];++i) { recmgr->retire(tid, deletedNodes[i]); } SOFTWARE_BARRIER; // ensure nodes are placed in the epoch bag BEFORE they are removed from announcements. threadData[tid].numAnnouncements -= i; assert(threadData[tid].numAnnouncements >= 0); } private: inline void set_insertion_timestamps( const int tid, const long long ts, NodeType * const * const insertedNodes, NodeType * const * const deletedNodes) { // set insertion timestamps // for each i_node in insertedNodes for (int i_nodeix=0;insertedNodes[i_nodeix];++i_nodeix) { insertedNodes[i_nodeix]->itime = ts; } } inline void set_deletion_timestamps( const int tid, const long long ts, NodeType * const * const insertedNodes, NodeType * const * const deletedNodes) { // set deletion timestamps // for each d_node in deletedNodes for (int d_nodeix=0;deletedNodes[d_nodeix];++d_nodeix) { deletedNodes[d_nodeix]->dtime = ts; } } public: // replace the linearization point of an update that inserts or deletes nodes // with an invocation of this function if the linearization point is a WRITE template inline T linearize_update_at_write( const int tid, T volatile * const lin_addr, const T& lin_newval, NodeType * const * const insertedNodes, NodeType * const * const deletedNodes) { if (!logicalDeletion) { // physical deletion will happen at the same time as logical deletion announce_physical_deletion(tid, deletedNodes); } // htm path long long ts; int limit = MAX_HTM_ATTEMPTS; while (limit--) { while (rwlock.isWriteLocked()) {} if (XBEGIN() == _XBEGIN_STARTED) { if (rwlock.isWriteLocked()) XABORT(1); ts = timestamp; *lin_addr = lin_newval; // original linearization point XEND(); ++threadData[tid].commitReader; goto committed; } else ++threadData[tid].abortReader; } // fallback path ++threadData[tid].fallback; rwlock.readLock(); ts = timestamp; *lin_addr = lin_newval; // original linearization point rwlock.readUnlock(); committed: set_insertion_timestamps(tid, ts, insertedNodes, deletedNodes); set_deletion_timestamps(tid, ts, insertedNodes, deletedNodes); if (!logicalDeletion) { // physical deletion will happen at the same time as logical deletion physical_deletion_succeeded(tid, deletedNodes); } #if defined USE_RQ_DEBUGGING DEBUG_RECORD_UPDATE_CHECKSUM(tid, ts, insertedNodes, deletedNodes, ds); #endif return lin_newval; } // replace the linearization point of an update that inserts or deletes nodes // with an invocation of this function if the linearization point is a CAS template inline T linearize_update_at_cas( const int tid, T volatile * const 
lin_addr, const T& lin_oldval, const T& lin_newval, NodeType * const * const insertedNodes, NodeType * const * const deletedNodes) { if (!logicalDeletion) { // physical deletion will happen at the same time as logical deletion announce_physical_deletion(tid, deletedNodes); } // htm path long long ts; T res; int limit = MAX_HTM_ATTEMPTS; while (limit--) { while (rwlock.isWriteLocked()) {} if (XBEGIN() == _XBEGIN_STARTED) { if (rwlock.isWriteLocked()) XABORT(1); ts = timestamp; res = __sync_val_compare_and_swap(lin_addr, lin_oldval, lin_newval); // original linearization point // res = *lin_addr; // manually implement CAS // if (res == lin_oldval) *lin_addr = lin_newval; XEND(); ++threadData[tid].commitReader; goto committed; } else ++threadData[tid].abortReader; } // fallback path ++threadData[tid].fallback; rwlock.readLock(); ts = timestamp; res = __sync_val_compare_and_swap(lin_addr, lin_oldval, lin_newval); rwlock.readUnlock(); committed: if (res == lin_oldval){ set_insertion_timestamps(tid, ts, insertedNodes, deletedNodes); set_deletion_timestamps(tid, ts, insertedNodes, deletedNodes); if (!logicalDeletion) { // physical deletion will happen at the same time as logical deletion physical_deletion_succeeded(tid, deletedNodes); } #if defined USE_RQ_DEBUGGING DEBUG_RECORD_UPDATE_CHECKSUM(tid, ts, insertedNodes, deletedNodes, ds); #endif } else { if (!logicalDeletion) { // physical deletion will happen at the same time as logical deletion physical_deletion_failed(tid, deletedNodes); } } return res; } // invoke at the start of each traversal inline void traversal_start(const int tid) { threadData[tid].hashlist->clear(); // // htm path // int limit = MAX_HTM_ATTEMPTS; // while (limit--) { // while (rwlock.isLocked()) {} // if (XBEGIN() == _XBEGIN_STARTED) { // if (rwlock.isLocked()) XABORT(1); // threadData[tid].rq_lin_time = ++timestamp; // linearization point of range query (at the write to timestamp) // XEND(); // ++threadData[tid].commitWriter; // goto committed; // } else ++threadData[tid].abortWriter; // } //committed: // fallback path rwlock.writeLock(); threadData[tid].rq_lin_time = ++timestamp; // linearization point of range query (at the write to timestamp) rwlock.writeUnlock(); } private: // invoke each time a traversal visits a node with a key in the desired range: // if the node belongs in the range query, it will be placed in rqResult[index] inline int __traversal_try_add(const int tid, NodeType * const node, NodeType ** const nodeSource, K * const outputKeys, V * const outputValues, const K& lo, const K& hi, bool foundDuringTraversal) { // rqResultKeys should have space for MAX_KEYS_PER_NODE keys, AT LEAST // in the following, rather than having deeply nested if-else blocks, // we return asap, and list facts that must be true if we didn't return assert(foundDuringTraversal || !logicalDeletion || ds->isLogicallyDeleted(tid, node)); long long itime = TIMESTAMP_NOT_SET; while (itime == TIMESTAMP_NOT_SET) { itime = node->itime; } if (node->itime >= threadData[tid].rq_lin_time) return 0; // node was inserted after the range query // fact: node was inserted before the range query bool logicallyDeleted = (logicalDeletion && ds->isLogicallyDeleted(tid, node)); long long dtime = TIMESTAMP_NOT_SET; if (!logicalDeletion && foundDuringTraversal) goto tryAddToRQ; // no logical deletion. since node was inserted before the range query, and the traversal encountered it, it must have been deleted AFTER the traversal encountered it. 
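        // The XBEGIN/XEND fast path in linearize_update_at_write/cas above follows the
        // standard RTM lock-elision recipe. A self-contained sketch using the raw Intel
        // intrinsics rather than this project's macros (illustrative only; assumes an
        // RWLock with the isWriteLocked/readLock/readUnlock interface used here):
        //
        //     #include <immintrin.h>
        //     long long elided_write(volatile long long * ts, long volatile * addr,
        //                            long newval, RWLock * lock, int attempts) {
        //         while (attempts--) {
        //             while (lock->isWriteLocked()) {}          // avoid the lemming effect
        //             if (_xbegin() == _XBEGIN_STARTED) {
        //                 if (lock->isWriteLocked()) _xabort(1); // subscribe to the lock
        //                 long long t = *ts;
        //                 *addr = newval;                        // original linearization point
        //                 _xend();
        //                 return t;
        //             }
        //         }
        //         lock->readLock();                              // non-speculative fallback
        //         long long t = *ts;
        //         *addr = newval;
        //         lock->readUnlock();
        //         return t;
        //     }
        //
        // Reading the lock inside the transaction adds it to the read set, so a range
        // query acquiring the write lock aborts all in-flight speculative updates.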
// fact: no logical deletion ==> did not find node during traversal dtime = node->dtime; if (dtime != TIMESTAMP_NOT_SET) { if (dtime < threadData[tid].rq_lin_time) return 0; // node was deleted before the range query goto tryAddToRQ; } // fact: dtime was not set above if (logicalDeletion && !logicallyDeleted) goto tryAddToRQ; // if logical deletion is used with marking, the fact that node was inserted before the range query, and that the traversal encountered node, is NOT enough to argue that node was in the data structure when the traversal started. why? when the traversal encountered node, it might have already been marked. so, we check if node is marked. if not, then the node has not yet been deleted. // fact: if there is logical deletion, then the node has now been deleted ///////////////////////// HANDLE UNKNOWN DTIME ///////////////////////// // if we are executing this because node was ANNOUNCED by a process, // as something that MIGHT soon be deleted (if nodeSource != NULL), // then node might not ever actually be deleted, // so we can't spin forever on dtime. if (nodeSource != NULL) { while (dtime == TIMESTAMP_NOT_SET && *nodeSource == node) { dtime = node->dtime; } if (dtime == TIMESTAMP_NOT_SET) { // above loop exited because the process removed its announcement to this node! // if the process deleted the node, then it removed the // announcement AFTER setting dtime. // so we reread dtime one more time, to figure out whether // the process actually deleted the node. SOFTWARE_BARRIER; // prevent read of dtime from happening before last read of *nodeSource dtime = node->dtime; if (dtime == TIMESTAMP_NOT_SET) { // since dtime is not set, the process did NOT delete the node. // so, either a DIFFERENT process deleted it, // or it was found during the data structure traversal. // if another process deleted it, then we will find it // either in that process' announcements, or in a limbo bag. return 0; } // the node has been deleted, and dtime is set, so we check dtime below. } } else { while (dtime == TIMESTAMP_NOT_SET) { dtime = node->dtime; } } if (dtime < threadData[tid].rq_lin_time) return 0; // node was deleted before the range query // fact: node was inserted before the rq and deleted after it ///////////////////// TRY TO ADD NODE'S KEYS TO RQ ///////////////////// // note: this way of organizing this decision tree favors trees with fat multi-key nodes, because getKeys is delayed as long as possible. tryAddToRQ: // fetch the node's keys that are in the set int cnt = ds->getKeys(tid, node, outputKeys, outputValues); assert(cnt < RQ_DEBUGGING_MAX_KEYS_PER_NODE); if (cnt == 0) return 0; // node doesn't contain any keys that are in the set // TODO: properly assert that getKeys doesn't run out of bounds on outputKeys[...] (i'm quite certain it doesn't, currently, though.) 
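        // Why the post-barrier reread of dtime above is sound (a sketch of the
        // handshake, using only the ordering already present in this file): a deleting
        // process executes (1) set node->dtime, (2) retire node, (3) remove its
        // announcement; this reader executes (a) observe the announcement gone,
        // (b) reread node->dtime. if the process really deleted node, step (1)
        // happened before step (3), so once (a) occurs, (b) must observe dtime set.
        // hence a dtime still unset at (b) proves this process did not delete node.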
// note: in the following loop, we shift keys in the outputKeys array left to eliminate any that ultimately should not be added to the range query int numNewKeys = 0; for (int i=0;iisInRange(outputKeys[i], lo, hi)) goto doNotAddToRQ; // key is NOT in the desired range if (threadData[tid].hashlist->contains(outputKeys[i])) goto doNotAddToRQ; // key is already in the range query outputKeys[numNewKeys] = outputKeys[i]; // save this as a new key added to the RQ outputValues[numNewKeys] = outputValues[i]; ++numNewKeys; doNotAddToRQ: (0); } return numNewKeys; } inline void traversal_try_add(const int tid, NodeType * const node, NodeType ** const nodeSource, K * const rqResultKeys, V * const rqResultValues, int * const startIndex, const K& lo, const K& hi, bool foundDuringTraversal) { //#if defined MICROBENCH && !defined NDEBUG // assert(*startIndex < 2*RQSIZE); // note: this assert is a hack. it should be *startIndex < size of rqResultKeys // if (*startIndex >= RQSIZE) { // cout<<"ERROR: *startIndex="<<(*startIndex)<<" is unexpectedly greater than or equal to RQSIZE="<insert(rqResultKeys[(*startIndex)++]); } // note: the above increments startIndex #if defined MICROBENCH assert(*startIndex <= RQSIZE); #endif } public: inline void traversal_try_add(const int tid, NodeType * const node, K * const rqResultKeys, V * const rqResultValues, int * const startIndex, const K& lo, const K& hi) { traversal_try_add(tid, node, NULL, rqResultKeys, rqResultValues, startIndex, lo, hi, true); } // invoke at the end of each traversal: // any nodes that were deleted during the traversal, // and were consequently missed during the traversal, // are placed in rqResult[index] void traversal_end(const int tid, K * const rqResultKeys, V * const rqResultValues, int * const startIndex, const K& lo, const K& hi) { // todo: possibly optimize by skipping entire blocks if there are many keys to skip (does not seem to be justifiable for 4 work threads and 4 range query threads) SOFTWARE_BARRIER; long long end_timestamp = timestamp; SOFTWARE_BARRIER; // collect nodes announced by other processes for (int otherTid=0;otherTid * all_bags[NUM_PROCESSES*NUMBER_OF_EPOCH_BAGS+1]; vector> all_iterators; int numIterators = 0; for (int otherTid=0;otherTid * thread_bags[NUMBER_OF_EPOCH_BAGS+1]; recmgr->get((NodeType *) NULL)->reclaim->getSafeBlockbags(otherTid, thread_bags); for (int i=0;thread_bags[i];++i) { all_bags[numIterators] = thread_bags[i]; all_iterators.push_back(thread_bags[i]->begin()); ++numIterators; } } int numSkippedInEpochBags = 0; int numVisitedInEpochBags = 0; for (int ix = 0; ix < numIterators; ++ix) { for (; all_iterators[ix] != all_bags[ix]->end(); all_iterators[ix]++) { NodeType * node = (*all_iterators[ix]); assert(node); ++numVisitedInEpochBags; ++numSkippedInEpochBags; long long dtime = node->dtime; if (dtime != TIMESTAMP_NOT_SET && dtime > end_timestamp) continue; --numSkippedInEpochBags; if (!(logicalDeletion && canRetireNodesLogicallyDeletedByOtherProcesses)) { // if we cannot retire nodes that are logically deleted // by other processes, then we always retire nodes in // order of increasing dtime values. // so, the blockbag will be ordered, which means that, // if dtime is before the RQ, then all remaining nodes // in this bag were deleted before the RQ. // so, in this case, we skip to the next bag. 
if (dtime != TIMESTAMP_NOT_SET && dtime < threadData[tid].rq_lin_time) break; } traversal_try_add(tid, node, NULL, rqResultKeys, rqResultValues, startIndex, lo, hi, false); } } #if defined MICROBENCH && !defined NDEBUG if (*startIndex > RQSIZE) { cout<<"ERROR: *startIndex="<<(*startIndex)<<" is unexpectedly greater than or equal to RQSIZE="< #include #include #include // the following define enables an optimization that i'm not sure is correct. //#define COLLECT_ANNOUNCEMENTS_FAST template class RQProvider { private: struct __rq_thread_data { #define __RQ_THREAD_DATA_SIZE 1024 union { struct { // anonymous struct inside anonymous union means we don't need to type anything special to access these variables long long rq_lin_time; HashList * hashlist; volatile char padding0[PREFETCH_SIZE_BYTES]; void * announcements[MAX_NODES_DELETED_ATOMICALLY+1]; int numAnnouncements; }; char bytes[__RQ_THREAD_DATA_SIZE]; // avoid false sharing }; } __attribute__((aligned(__RQ_THREAD_DATA_SIZE))); #define TIMESTAMP_NOT_SET 0 #define HASHLIST_INIT_CAPACITY_POW2 (1<<8) const int NUM_PROCESSES; volatile char padding0[PREFETCH_SIZE_BYTES]; volatile long long timestamp = 1; volatile char padding1[PREFETCH_SIZE_BYTES]; RWLock rwlock; volatile char padding2[PREFETCH_SIZE_BYTES]; __rq_thread_data * threadData; DataStructure * ds; RecordManager * const recmgr; int init[MAX_TID_POW2] = {0,}; public: RQProvider(const int numProcesses, DataStructure * ds, RecordManager * recmgr) : NUM_PROCESSES(numProcesses), ds(ds), recmgr(recmgr) { threadData = new __rq_thread_data[numProcesses]; DEBUG_INIT_RQPROVIDER(numProcesses); } ~RQProvider() { // for (int tid=0;tiddestroy(); // delete threadData[tid].hashlist; // } delete[] threadData; DEBUG_DEINIT_RQPROVIDER(NUM_PROCESSES); } // invoke before a given thread can perform any rq_functions void initThread(const int tid) { if (init[tid]) return; else init[tid] = !init[tid]; threadData[tid].hashlist = new HashList(); threadData[tid].hashlist->init(HASHLIST_INIT_CAPACITY_POW2); threadData[tid].numAnnouncements = 0; for (int i=0;idestroy(); delete threadData[tid].hashlist; DEBUG_DEINIT_THREAD(tid); } // invoke whenever a new node is created/initialized inline void init_node(const int tid, NodeType * const node) { node->itime = TIMESTAMP_NOT_SET; node->dtime = TIMESTAMP_NOT_SET; } // for each address addr that is modified by rq_linearize_update_at_write // or rq_linearize_update_at_cas, you must replace any initialization of addr // with invocations of rq_write_addr template inline void write_addr(const int tid, T volatile * const addr, const T val) { *addr = val; } // for each address addr that is modified by rq_linearize_update_at_write // or rq_linearize_update_at_cas, you must replace any reads of addr with // invocations of rq_read_addr template inline T read_addr(const int tid, T volatile * const addr) { return *addr; } // IF DATA STRUCTURE PERFORMS LOGICAL DELETION // run some time BEFORE the physical deletion of a node // whose key has ALREADY been logically deleted. 
void announce_physical_deletion(const int tid, NodeType * const * const deletedNodes) { int i; for (i=0;deletedNodes[i];++i) { threadData[tid].announcements[threadData[tid].numAnnouncements+i] = deletedNodes[i]; } SOFTWARE_BARRIER; threadData[tid].numAnnouncements += i; assert(threadData[tid].numAnnouncements <= MAX_NODES_DELETED_ATOMICALLY); SOFTWARE_BARRIER; } // IF DATA STRUCTURE PERFORMS LOGICAL DELETION // run AFTER performing announce_physical_deletion, // if the cas that was trying to physically delete node failed. void physical_deletion_failed(const int tid, NodeType * const * const deletedNodes) { for (int i=0;deletedNodes[i];++i) { --threadData[tid].numAnnouncements; #ifdef COLLECT_ANNOUNCEMENTS_FAST threadData[tid].announcements[threadData[tid].numAnnouncements] = NULL; #endif } assert(threadData[tid].numAnnouncements >= 0); } // IF DATA STRUCTURE PERFORMS LOGICAL DELETION // run AFTER performing announce_physical_deletion, // if the cas that was trying to physically delete node succeeded. void physical_deletion_succeeded(const int tid, NodeType * const * const deletedNodes) { int i; for (i=0;deletedNodes[i];++i) { recmgr->retire(tid, deletedNodes[i]); } SOFTWARE_BARRIER; // ensure nodes are placed in the epoch bag BEFORE they are removed from announcements. threadData[tid].numAnnouncements -= i; assert(threadData[tid].numAnnouncements >= 0); } private: inline void set_insertion_timestamps( const int tid, const long long ts, NodeType * const * const insertedNodes, NodeType * const * const deletedNodes) { // set insertion timestamps // for each i_node in insertedNodes for (int i_nodeix=0;insertedNodes[i_nodeix];++i_nodeix) { insertedNodes[i_nodeix]->itime = ts; } } inline void set_deletion_timestamps( const int tid, const long long ts, NodeType * const * const insertedNodes, NodeType * const * const deletedNodes) { // set deletion timestamps // for each d_node in deletedNodes for (int d_nodeix=0;deletedNodes[d_nodeix];++d_nodeix) { deletedNodes[d_nodeix]->dtime = ts; } } public: // replace the linearization point of an update that inserts or deletes nodes // with an invocation of this function if the linearization point is a WRITE template inline T linearize_update_at_write( const int tid, T volatile * const lin_addr, const T& lin_newval, NodeType * const * const insertedNodes, NodeType * const * const deletedNodes) { if (!logicalDeletion) { // physical deletion will happen at the same time as logical deletion announce_physical_deletion(tid, deletedNodes); } rwlock.readLock(); long long ts = timestamp; *lin_addr = lin_newval; // original linearization point rwlock.readUnlock(); set_insertion_timestamps(tid, ts, insertedNodes, deletedNodes); set_deletion_timestamps(tid, ts, insertedNodes, deletedNodes); if (!logicalDeletion) { // physical deletion will happen at the same time as logical deletion physical_deletion_succeeded(tid, deletedNodes); } #if defined USE_RQ_DEBUGGING DEBUG_RECORD_UPDATE_CHECKSUM(tid, ts, insertedNodes, deletedNodes, ds); #endif return lin_newval; } // replace the linearization point of an update that inserts or deletes nodes // with an invocation of this function if the linearization point is a CAS template inline T linearize_update_at_cas( const int tid, T volatile * const lin_addr, const T& lin_oldval, const T& lin_newval, NodeType * const * const insertedNodes, NodeType * const * const deletedNodes) { if (!logicalDeletion) { // physical deletion will happen at the same time as logical deletion announce_physical_deletion(tid, deletedNodes); } 
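        // The lock discipline below inverts the usual reader/writer roles: updates
        // share the lock in READ mode (so they run concurrently), and each range query
        // takes it in WRITE mode just long enough to bump the clock. A minimal sketch
        // of the idea (illustrative, not this class's code):
        //
        //     RWLock lock; volatile long long ts = 1;
        //     long long do_update(long volatile * a, long v) {
        //         lock.readLock();          // many updates may hold this concurrently
        //         long long t = ts;         // every update in this epoch sees the same ts
        //         *a = v;                   // original linearization point
        //         lock.readUnlock();
        //         return t;                 // stamp itime/dtime with t
        //     }
        //     long long rq_begin() {
        //         lock.writeLock();         // waits for all in-flight updates to drain
        //         long long t = ++ts;       // rq_lin_time
        //         lock.writeUnlock();
        //         return t;
        //     }
        //
        // No update can straddle the increment, so the stamps totally order updates
        // against queries without any per-update CAS on the clock.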
rwlock.readLock(); long long ts = timestamp; T res = __sync_val_compare_and_swap(lin_addr, lin_oldval, lin_newval); rwlock.readUnlock(); if (res == lin_oldval){ set_insertion_timestamps(tid, ts, insertedNodes, deletedNodes); set_deletion_timestamps(tid, ts, insertedNodes, deletedNodes); if (!logicalDeletion) { // physical deletion will happen at the same time as logical deletion physical_deletion_succeeded(tid, deletedNodes); } #if defined USE_RQ_DEBUGGING DEBUG_RECORD_UPDATE_CHECKSUM(tid, ts, insertedNodes, deletedNodes, ds); #endif } else { if (!logicalDeletion) { // physical deletion will happen at the same time as logical deletion physical_deletion_failed(tid, deletedNodes); } } return res; } // invoke at the start of each traversal inline void traversal_start(const int tid) { threadData[tid].hashlist->clear(); rwlock.writeLock(); threadData[tid].rq_lin_time = ++timestamp; // linearization point of range query (at the write to timestamp) rwlock.writeUnlock(); } private: // invoke each time a traversal visits a node with a key in the desired range: // if the node belongs in the range query, it will be placed in rqResult[index] inline int __traversal_try_add(const int tid, NodeType * const node, NodeType ** const nodeSource, K * const outputKeys, V * const outputValues, const K& lo, const K& hi, bool foundDuringTraversal) { // rqResultKeys should have space for MAX_KEYS_PER_NODE keys, AT LEAST // in the following, rather than having deeply nested if-else blocks, // we return asap, and list facts that must be true if we didn't return assert(foundDuringTraversal || !logicalDeletion || ds->isLogicallyDeleted(tid, node)); long long itime = TIMESTAMP_NOT_SET; while (itime == TIMESTAMP_NOT_SET) { itime = node->itime; } if (node->itime >= threadData[tid].rq_lin_time) return 0; // node was inserted after the range query // fact: node was inserted before the range query bool logicallyDeleted = (logicalDeletion && ds->isLogicallyDeleted(tid, node)); long long dtime = TIMESTAMP_NOT_SET; if (!logicalDeletion && foundDuringTraversal) goto tryAddToRQ; // no logical deletion. since node was inserted before the range query, and the traversal encountered it, it must have been deleted AFTER the traversal encountered it. // fact: no logical deletion ==> did not find node during traversal dtime = node->dtime; if (dtime != TIMESTAMP_NOT_SET) { if (dtime < threadData[tid].rq_lin_time) return 0; // node was deleted before the range query goto tryAddToRQ; } // fact: dtime was not set above if (logicalDeletion && !logicallyDeleted) goto tryAddToRQ; // if logical deletion is used with marking, the fact that node was inserted before the range query, and that the traversal encountered node, is NOT enough to argue that node was in the data structure when the traversal started. why? when the traversal encountered node, it might have already been marked. so, we check if node is marked. if not, then the node has not yet been deleted. // fact: if there is logical deletion, then the node has now been deleted ///////////////////////// HANDLE UNKNOWN DTIME ///////////////////////// // if we are executing this because node was ANNOUNCED by a process, // as something that MIGHT soon be deleted (if nodeSource != NULL), // then node might not ever actually be deleted, // so we can't spin forever on dtime. if (nodeSource != NULL) { while (dtime == TIMESTAMP_NOT_SET && *nodeSource == node) { dtime = node->dtime; } if (dtime == TIMESTAMP_NOT_SET) { // above loop exited because the process removed its announcement to this node! 
// if the process deleted the node, then it removed the // announcement AFTER setting dtime. // so we reread dtime one more time, to figure out whether // the process actually deleted the node. SOFTWARE_BARRIER; // prevent read of dtime from happening before last read of *nodeSource dtime = node->dtime; if (dtime == TIMESTAMP_NOT_SET) { // since dtime is not set, the process did NOT delete the node. // so, either a DIFFERENT process deleted it, // or it was found during the data structure traversal. // if another process deleted it, then we will find it // either in that process' announcements, or in a limbo bag. return 0; } // the node has been deleted, and dtime is set, so we check dtime below. } } else { while (dtime == TIMESTAMP_NOT_SET) { dtime = node->dtime; } } if (dtime < threadData[tid].rq_lin_time) return 0; // node was deleted before the range query // fact: node was inserted before the rq and deleted after it ///////////////////// TRY TO ADD NODE'S KEYS TO RQ ///////////////////// // note: this way of organizing this decision tree favors trees with fat multi-key nodes, because getKeys is delayed as long as possible. tryAddToRQ: // fetch the node's keys that are in the set int cnt = ds->getKeys(tid, node, outputKeys, outputValues); assert(cnt < RQ_DEBUGGING_MAX_KEYS_PER_NODE); if (cnt == 0) return 0; // node doesn't contain any keys that are in the set // TODO: properly assert that getKeys doesn't run out of bounds on outputKeys[...] (i'm quite certain it doesn't, currently, though.) // note: in the following loop, we shift keys in the outputKeys array left to eliminate any that ultimately should not be added to the range query int numNewKeys = 0; for (int i=0;iisInRange(outputKeys[i], lo, hi)) goto doNotAddToRQ; // key is NOT in the desired range if (threadData[tid].hashlist->contains(outputKeys[i])) goto doNotAddToRQ; // key is already in the range query outputKeys[numNewKeys] = outputKeys[i]; // save this as a new key added to the RQ outputValues[numNewKeys] = outputValues[i]; ++numNewKeys; doNotAddToRQ: (0); } return numNewKeys; } inline void traversal_try_add(const int tid, NodeType * const node, NodeType ** const nodeSource, K * const rqResultKeys, V * const rqResultValues, int * const startIndex, const K& lo, const K& hi, bool foundDuringTraversal) { //#if defined MICROBENCH && !defined NDEBUG // assert(*startIndex < 2*RQSIZE); // note: this assert is a hack. 
it should be *startIndex < size of rqResultKeys // if (*startIndex >= RQSIZE) { // cout<<"ERROR: *startIndex="<<(*startIndex)<<" is unexpectedly greater than or equal to RQSIZE="<insert(rqResultKeys[(*startIndex)++]); } // note: the above increments startIndex #if defined MICROBENCH assert(*startIndex <= RQSIZE); #endif } public: inline void traversal_try_add(const int tid, NodeType * const node, K * const rqResultKeys, V * const rqResultValues, int * const startIndex, const K& lo, const K& hi) { traversal_try_add(tid, node, NULL, rqResultKeys, rqResultValues, startIndex, lo, hi, true); } // invoke at the end of each traversal: // any nodes that were deleted during the traversal, // and were consequently missed during the traversal, // are placed in rqResult[index] void traversal_end(const int tid, K * const rqResultKeys, V * const rqResultValues, int * const startIndex, const K& lo, const K& hi) { // todo: possibly optimize by skipping entire blocks if there are many keys to skip (does not seem to be justifiable for 4 work threads and 4 range query threads) SOFTWARE_BARRIER; long long end_timestamp = timestamp; SOFTWARE_BARRIER; // collect nodes announced by other processes #ifdef COLLECT_ANNOUNCEMENTS_FAST int numCollected = 0; NodeType * collectedAnnouncement[NUM_PROCESSES*MAX_NODES_DELETED_ATOMICALLY]; NodeType ** announcementSource[NUM_PROCESSES*MAX_NODES_DELETED_ATOMICALLY]; #endif for (int otherTid=0;otherTid * all_bags[NUM_PROCESSES*NUMBER_OF_EPOCH_BAGS+1]; vector> all_iterators; int numIterators = 0; for (int otherTid=0;otherTid * thread_bags[NUMBER_OF_EPOCH_BAGS+1]; recmgr->get((NodeType *) NULL)->reclaim->getSafeBlockbags(otherTid, thread_bags); for (int i=0;thread_bags[i];++i) { all_bags[numIterators] = thread_bags[i]; all_iterators.push_back(thread_bags[i]->begin()); ++numIterators; } } #ifdef COLLECT_ANNOUNCEMENTS_FAST // try to add nodes collected from process announcements to the RQ for (int i=0;iend(); all_iterators[ix]++) { NodeType * node = (*all_iterators[ix]); assert(node); ++numVisitedInEpochBags; ++numSkippedInEpochBags; long long dtime = node->dtime; if (dtime != TIMESTAMP_NOT_SET && dtime > end_timestamp) continue; --numSkippedInEpochBags; if (!(logicalDeletion && canRetireNodesLogicallyDeletedByOtherProcesses)) { // if we cannot retire nodes that are logically deleted // by other processes, then we always retire nodes in // order of increasing dtime values. // so, the blockbag will be ordered, which means that, // if dtime is before the RQ, then all remaining nodes // in this bag were deleted before the RQ. // so, in this case, we skip to the next bag. 
if (dtime != TIMESTAMP_NOT_SET && dtime < threadData[tid].rq_lin_time) break; } traversal_try_add(tid, node, NULL, rqResultKeys, rqResultValues, startIndex, lo, hi, false); } } #if defined MICROBENCH && !defined NDEBUG if (*startIndex > RQSIZE) { cout<<"ERROR: *startIndex="<<(*startIndex)<<" is unexpectedly greater than or equal to RQSIZE="< #include #include #include "snapcollector.h" template class RQProvider { private: struct __rq_thread_data { #define __RQ_THREAD_DATA_SIZE 1024 union { struct { // anonymous struct inside anonymous union means we don't need to type anything special to access these variables long long rq_lin_time; SnapCollector * currentSnapCollector; SnapCollector * snapCollectorToRetire; }; char bytes[__RQ_THREAD_DATA_SIZE]; // avoid false sharing }; } __attribute__((aligned(__RQ_THREAD_DATA_SIZE))); const int NUM_PROCESSES; volatile long long timestamp = 1; pthread_rwlock_t rwlock; __rq_thread_data * threadData; DataStructure * const ds; RecordManager * const recmgr; volatile char padding[PREFETCH_SIZE_BYTES]; SnapCollector * volatile snapPointer; int init[MAX_TID_POW2] = {0,}; public: RQProvider(const int numProcesses, DataStructure * ds, RecordManager * recmgr) : NUM_PROCESSES(numProcesses), ds(ds), recmgr(recmgr) { assert(logicalDeletion); // Timnat's iterator algorithm REQUIRES logical deletion! if (pthread_rwlock_init(&rwlock, NULL)) error("could not init rwlock"); threadData = new __rq_thread_data[numProcesses]; const int dummyTid = 0; recmgr->initThread(dummyTid); // must initialize record manager before allocating!! initThread(dummyTid); // initialize dummy snap collector snapPointer = recmgr->template allocate >(dummyTid); #ifdef __HANDLE_STATS GSTATS_APPEND(dummyTid, extra_type1_allocated_addresses, ((long long) snapPointer)%(1<<12)); #endif snapPointer->init(dummyTid, numProcesses, recmgr, ds->KEY_MIN, ds->KEY_MAX+1); snapPointer->BlockFurtherPointers(dummyTid, recmgr); snapPointer->Deactivate(NULL, NULL, NULL); snapPointer->BlockFurtherReports(); DEBUG_INIT_RQPROVIDER(numProcesses); } ~RQProvider() { if (pthread_rwlock_destroy(&rwlock)) error("could not destroy rwlock"); delete[] threadData; snapPointer->retire(0 /* dummy tid */, recmgr); DEBUG_DEINIT_RQPROVIDER(NUM_PROCESSES); } // invoke before a given thread can perform any rq_functions void initThread(const int tid) { if (init[tid]) return; else init[tid] = !init[tid]; threadData[tid].rq_lin_time = 0; threadData[tid].currentSnapCollector = NULL; threadData[tid].snapCollectorToRetire = NULL; DEBUG_INIT_THREAD(tid); } // invoke once a given thread will no longer perform any rq_ functions void deinitThread(const int tid) { if (!init[tid]) return; else init[tid] = !init[tid]; DEBUG_DEINIT_THREAD(tid); } // invoke whenever a new node is created/initialized inline void init_node(const int tid, NodeType * const node) {} // for each address addr that is modified by rq_linearize_update_at_write // or rq_linearize_update_at_cas, you must replace any initialization of addr // with invocations of rq_write_addr template inline void write_addr(const int tid, T volatile * const addr, const T val) { *addr = val; } // for each address addr that is modified by rq_linearize_update_at_write // or rq_linearize_update_at_cas, you must replace any reads of addr with // invocations of rq_read_addr template inline T read_addr(const int tid, T volatile * const addr) { return *addr; } /** * Added function only for Timnat's SnapCollector. * This must be invoked just before the return statement of every search. 
*/ inline void search_report_target_key(const int tid, const K key, NodeType * const node) { SnapCollector * sc = snapPointer; if (sc->IsActive()) { ReportType type = ds->isLogicallyDeleted(tid, node) ? ReportType::Remove : ReportType::Add; sc->Report(tid, node, type, key, recmgr); } SOFTWARE_BARRIER; } /** * Added function only for Timnat's SnapCollector. * This must be invoked just before the return statement of every insertion * that does not modify the data structure. */ inline void insert_readonly_report_target_key(const int tid, NodeType * const node) { SnapCollector * sc = snapPointer; if (sc->IsActive()) { if (!ds->isLogicallyDeleted(tid, node)) { sc->Report(tid, node, ReportType::Add, node->key, recmgr); } } SOFTWARE_BARRIER; } /** * Added function only for Timnat's SnapCollector. * This can be invoked to determine if the current SnapCollector is active. */ inline bool traversal_is_active(const int tid) { return threadData[tid].currentSnapCollector->IsActive(); } private: inline void delete_report_target_key(const int tid, NodeType * const node) { if (node) { SnapCollector * sc = snapPointer; if (sc->IsActive()) { sc->Report(tid, node, ReportType::Remove, node->key, recmgr); } SOFTWARE_BARRIER; } } public: // IF DATA STRUCTURE PERFORMS LOGICAL DELETION // run some time BEFORE the physical deletion of a node // whose key has ALREADY been logically deleted. inline void announce_physical_deletion(const int tid, NodeType * const * const deletedNodes) { assert(!deletedNodes[0] || !deletedNodes[1]); delete_report_target_key(tid, deletedNodes[0]); } // IF DATA STRUCTURE PERFORMS LOGICAL DELETION // run AFTER performing announce_physical_deletion, // if the cas that was trying to physically delete node failed. inline void physical_deletion_failed(const int tid, NodeType * const * const deletedNodes) {} // IF DATA STRUCTURE PERFORMS LOGICAL DELETION // run AFTER performing announce_physical_deletion, // if the cas that was trying to physically delete node succeeded. 
inline void physical_deletion_succeeded(const int tid, NodeType * const * const deletedNodes) { int i; for (i=0;deletedNodes[i];++i) { recmgr->retire(tid, deletedNodes[i]); } } // replace the linearization point of an update that inserts or deletes nodes // with an invocation of this function if the linearization point is a WRITE template inline T linearize_update_at_write( const int tid, T volatile * const lin_addr, const T& lin_newval, NodeType * const * const insertedNodes, NodeType * const * const deletedNodes) { assert((insertedNodes[0] && !deletedNodes[0]) || (!insertedNodes[0] && deletedNodes[0])); #ifdef RQ_USE_TIMESTAMPS if (pthread_rwlock_rdlock(&rwlock)) error("could not read-lock rwlock"); long long ts = timestamp; #else long long ts = 1; #endif *lin_addr = lin_newval; // original linearization point #ifdef RQ_USE_TIMESTAMPS if (pthread_rwlock_unlock(&rwlock)) error("could not read-unlock rwlock"); #endif if (insertedNodes[0]) insert_readonly_report_target_key(tid, insertedNodes[0]); if (deletedNodes[0]) delete_report_target_key(tid, deletedNodes[0]); #if defined USE_RQ_DEBUGGING DEBUG_RECORD_UPDATE_CHECKSUM(tid, ts, insertedNodes, deletedNodes, ds); #endif return lin_newval; } // replace the linearization point of an update that inserts or deletes nodes // with an invocation of this function if the linearization point is a CAS template inline T linearize_update_at_cas( const int tid, T volatile * const lin_addr, const T& lin_oldval, const T& lin_newval, NodeType * const * const insertedNodes, NodeType * const * const deletedNodes) { assert((insertedNodes[0] && !deletedNodes[0]) || (!insertedNodes[0] && deletedNodes[0])); #ifdef RQ_USE_TIMESTAMPS if (pthread_rwlock_rdlock(&rwlock)) error("could not read-lock rwlock"); long long ts = timestamp; #else long long ts = 1; #endif T res = __sync_val_compare_and_swap(lin_addr, lin_oldval, lin_newval); #ifdef RQ_USE_TIMESTAMPS if (pthread_rwlock_unlock(&rwlock)) error("could not read-unlock rwlock"); #endif if (res == lin_oldval){ if (insertedNodes[0]) insert_readonly_report_target_key(tid, insertedNodes[0]); if (deletedNodes[0]) delete_report_target_key(tid, deletedNodes[0]); #if defined USE_RQ_DEBUGGING DEBUG_RECORD_UPDATE_CHECKSUM(tid, ts, insertedNodes, deletedNodes, ds); #endif } return res; } // invoke at the start of each traversal inline void traversal_start(const int tid) { #if !defined(RQ_USE_TIMESTAMPS) threadData[tid].rq_lin_time = 1; #endif threadData[tid].currentSnapCollector = snapPointer; SOFTWARE_BARRIER; if (!threadData[tid].currentSnapCollector->IsActive()) { SnapCollector * candidate = recmgr->template allocate >(tid); #ifdef __HANDLE_STATS GSTATS_APPEND(tid, extra_type1_allocated_addresses, ((long long) candidate)%(1<<12)); #endif candidate->init(tid, NUM_PROCESSES, recmgr, ds->KEY_MIN, ds->KEY_MAX+1); if (__sync_bool_compare_and_swap(&snapPointer, threadData[tid].currentSnapCollector, candidate)) { // delay retiring until later, because we've started accepting reports, // and we don't want to waste time while we are accepting reports, // because we don't want to receive many reports... 
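// NOTE (added commentary): exactly one thread wins the CAS on snapPointer
// above and becomes responsible for retiring the old, deactivated collector;
// that retirement is deferred (via snapCollectorToRetire) to traversal_end,
// for the reason explained above. Losing threads retire their unused
// candidate instead and re-read the newly published collector.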
threadData[tid].snapCollectorToRetire = threadData[tid].currentSnapCollector; threadData[tid].currentSnapCollector = candidate; } else { candidate->retire(tid, recmgr); threadData[tid].currentSnapCollector = snapPointer; } } // usleep(200000); } inline NodeType * traversal_try_add(const int tid, NodeType * const node, K * const rqResultKeys, V * const rqResultValues, int * const startIndex, const K& lo, const K& hi) { SnapCollector * sc = threadData[tid].currentSnapCollector; return sc->AddNode(tid, node, node->key, recmgr); } // invoke at the end of each traversal: // any nodes that were deleted during the traversal, // and were consequently missed during the traversal, // are placed in rqResult[index] void traversal_end(const int tid, K * const rqResultKeys, V * const rqResultValues, int * const startIndex, const K& lo, const K& hi) { SnapCollector * sc = threadData[tid].currentSnapCollector; sc->BlockFurtherPointers(tid, recmgr); SOFTWARE_BARRIER; sc->Deactivate(NULL, NULL, NULL); sc->BlockFurtherReports(); SOFTWARE_BARRIER; sc->Prepare(tid, recmgr); NodeType * curr = NULL; while ((curr = sc->GetNext(tid))) { if (curr->key < lo) continue; if (curr->key > hi) break; rqResultKeys[*startIndex] = curr->key; rqResultValues[*startIndex] = curr->val; ++*startIndex; } #if defined MICROBENCH assert(*startIndex <= RQSIZE); #endif #ifdef SNAPCOLLECTOR_PRINT_RQS // for (int i=0;i<*startIndex;++i) { // cout<<" "<retire(tid, recmgr); threadData[tid].snapCollectorToRetire = NULL; } } }; #endif /* RQ_RWLOCK_H */ ================================================ FILE: datastructures/trevor_brown_abtree/common/rq/rq_unsafe.h ================================================ /* * File: rq_unsafe.h * Author: trbot * * Created on May 15, 2017, 5:06 PM */ #ifndef RQ_UNSAFE_H #define RQ_UNSAFE_H #include "rq_debugging.h" #include #include #ifndef casword_t #define casword_t uintptr_t #endif template class RQProvider { private: struct __rq_thread_data { #define __RQ_THREAD_DATA_SIZE 1024 union { struct { // anonymous struct inside anonymous union means we don't need to type anything special to access these variables long long rq_lin_time; }; char bytes[__RQ_THREAD_DATA_SIZE]; // avoid false sharing }; } __attribute__((aligned(__RQ_THREAD_DATA_SIZE))); #define TIMESTAMP_NOT_SET 0 const int NUM_PROCESSES; volatile char padding0[PREFETCH_SIZE_BYTES]; volatile long long timestamp = 1; volatile char padding1[PREFETCH_SIZE_BYTES]; RWLock rwlock; volatile char padding2[PREFETCH_SIZE_BYTES]; __rq_thread_data * threadData; DataStructure * ds; RecordManager * const recmgr; int init[MAX_TID_POW2] = {0,}; public: RQProvider(const int numProcesses, DataStructure * ds, RecordManager * recmgr) : NUM_PROCESSES(numProcesses), ds(ds), recmgr(recmgr) { threadData = new __rq_thread_data[numProcesses]; DEBUG_INIT_RQPROVIDER(numProcesses); } ~RQProvider() { delete[] threadData; DEBUG_DEINIT_RQPROVIDER(NUM_PROCESSES); } // invoke before a given thread can perform any rq_functions void initThread(const int tid) { if (init[tid]) return; else init[tid] = !init[tid]; DEBUG_INIT_THREAD(tid); } // invoke once a given thread will no longer perform any rq_ functions void deinitThread(const int tid) { if (!init[tid]) return; else init[tid] = !init[tid]; DEBUG_DEINIT_THREAD(tid); } // invoke whenever a new node is created/initialized inline void init_node(const int tid, NodeType * const node) {} // for each address addr that is modified by rq_linearize_update_at_write // or rq_linearize_update_at_cas, you must replace any initialization of addr 
// with invocations of rq_write_addr template inline void write_addr(const int tid, T volatile * const addr, const T val) { *addr = val; } // for each address addr that is modified by rq_linearize_update_at_write // or rq_linearize_update_at_cas, you must replace any reads of addr with // invocations of rq_read_addr template inline T read_addr(const int tid, T volatile * const addr) { return *addr; } // IF DATA STRUCTURE PERFORMS LOGICAL DELETION // run some time BEFORE the physical deletion of a node // whose key has ALREADY been logically deleted. inline void announce_physical_deletion(const int tid, NodeType * const * const deletedNodes) {} // IF DATA STRUCTURE PERFORMS LOGICAL DELETION // run AFTER performing announce_physical_deletion, // if the cas that was trying to physically delete node failed. inline void physical_deletion_failed(const int tid, NodeType * const * const deletedNodes) {} // IF DATA STRUCTURE PERFORMS LOGICAL DELETION // run AFTER performing announce_physical_deletion, // if the cas that was trying to physically delete node succeeded. inline void physical_deletion_succeeded(const int tid, NodeType * const * const deletedNodes) { int i; for (i=0;deletedNodes[i];++i) { recmgr->retire(tid, deletedNodes[i]); } } // replace the linearization point of an update that inserts or deletes nodes // with an invocation of this function if the linearization point is a WRITE template inline T linearize_update_at_write( const int tid, T volatile * const lin_addr, const T& lin_newval, NodeType * const * const insertedNodes, NodeType * const * const deletedNodes) { if (!logicalDeletion) { // physical deletion will happen at the same time as logical deletion announce_physical_deletion(tid, deletedNodes); } #ifdef RQ_USE_TIMESTAMPS rwlock.readLock(); long long ts = timestamp; #else long long ts = 1; #endif *lin_addr = lin_newval; // original linearization point #ifdef RQ_USE_TIMESTAMPS rwlock.readUnlock(); #endif if (!logicalDeletion) { // physical deletion will happen at the same time as logical deletion physical_deletion_succeeded(tid, deletedNodes); } #if defined USE_RQ_DEBUGGING DEBUG_RECORD_UPDATE_CHECKSUM(tid, ts, insertedNodes, deletedNodes, ds); #endif return lin_newval; } // replace the linearization point of an update that inserts or deletes nodes // with an invocation of this function if the linearization point is a CAS template inline T linearize_update_at_cas( const int tid, T volatile * const lin_addr, const T& lin_oldval, const T& lin_newval, NodeType * const * const insertedNodes, NodeType * const * const deletedNodes) { if (!logicalDeletion) { // physical deletion will happen at the same time as logical deletion announce_physical_deletion(tid, deletedNodes); } #ifdef RQ_USE_TIMESTAMPS rwlock.readLock(); long long ts = timestamp; #else long long ts = 1; #endif T res = __sync_val_compare_and_swap(lin_addr, lin_oldval, lin_newval); // original linearization point #ifdef RQ_USE_TIMESTAMPS rwlock.readUnlock(); #endif if (res == lin_oldval) { if (!logicalDeletion) { // physical deletion will happen at the same time as logical deletion physical_deletion_succeeded(tid, deletedNodes); } #if defined USE_RQ_DEBUGGING DEBUG_RECORD_UPDATE_CHECKSUM(tid, ts, insertedNodes, deletedNodes, ds); #endif } else { if (!logicalDeletion) { // physical deletion will happen at the same time as logical deletion physical_deletion_failed(tid, deletedNodes); } } return res; } // invoke at the start of each traversal inline void traversal_start(const int tid) { #ifdef RQ_USE_TIMESTAMPS 
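// NOTE (added commentary): under RQ_USE_TIMESTAMPS, updates hold this rwlock
// in READ mode around their linearization points (see
// linearize_update_at_write/_cas above), so acquiring it in WRITE mode here
// lets the range query increment the shared timestamp at an instant when no
// update is between reading the timestamp and performing its linearizing
// write/CAS.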
rwlock.writeLock(); threadData[tid].rq_lin_time = ++timestamp; // linearization point of range query (at the write to timestamp) rwlock.writeUnlock(); #endif } // invoke each time a traversal visits a node with a key in the desired range: // if the node belongs in the range query, it will be placed in rqResult[index] inline void traversal_try_add(const int tid, NodeType * const node, K * const rqResultKeys, V * const rqResultValues, int * const startIndex, const K& lo, const K& hi) { int start = (*startIndex); int keysInNode = ds->getKeys(tid, node, rqResultKeys+start, rqResultValues+start); assert(keysInNode < RQ_DEBUGGING_MAX_KEYS_PER_NODE); if (keysInNode == 0) return; int location = start; for (int i=start;iisInRange(rqResultKeys[i], lo, hi)){ rqResultKeys[location] = rqResultKeys[i]; rqResultValues[location] = rqResultValues[i]; ++location; } } *startIndex = location; #if defined MICROBENCH assert(*startIndex <= RQSIZE); #endif } // invoke at the end of each traversal: // any nodes that were deleted during the traversal, // and were consequently missed during the traversal, // are placed in rqResult[index] inline void traversal_end(const int tid, K * const rqResultKeys, V * const rqResultValues, int * const startIndex, const K& lo, const K& hi) { DEBUG_RECORD_RQ_SIZE(*startIndex); DEBUG_RECORD_RQ_CHECKSUM(tid, threadData[tid].rq_lin_time, rqResultKeys, *startIndex); } }; #endif /* RQ_UNSAFE_H */ ================================================ FILE: datastructures/trevor_brown_abtree/common/rq/snapcollector/reportitem.h ================================================ /* * File: reportitem.h * Author: trbot * * Created on June 21, 2017, 4:47 PM */ #ifndef REPORTITEM_H #define REPORTITEM_H enum ReportType {Add, Remove}; static int getOrdinalForReportType(ReportType t) { return (t == ReportType::Add); } class ReportItem { public: void * node; ReportType t; ReportItem * volatile next; int key; int id; ReportItem() {} void init(void * node, ReportType t, int key) { this->node = node; this->t = t; next = NULL; this->key = key; id = 0; } }; class CompactReportItem { public: void * node; ReportType t; int key; int id; CompactReportItem() {} void init(void * node, ReportType t, int key) { this->node = node; this->t = t; this->key = key; id = 0; } }; struct { bool operator()(CompactReportItem * a, CompactReportItem * b) const { if (a->key != b->key) return a->key < b->key; if (a->node != b->node) return (long long) a->node < (long long) b->node; return getOrdinalForReportType(a->t) < getOrdinalForReportType(b->t); } } compareCRI; #endif /* REPORTITEM_H */ ================================================ FILE: datastructures/trevor_brown_abtree/common/rq/snapcollector/snapcollector.h ================================================ /* * File: snapcollector.h * Author: trbot * * Created on June 21, 2017, 4:57 PM */ #ifndef SNAPCOLLECTOR_H #define SNAPCOLLECTOR_H #include #include #include #include "reportitem.h" #include template class SnapCollector { public: int NUM_THREADS; class NodeWrapper { public: NodeType * node; NodeWrapper * volatile next; K key; NodeWrapper() {} void init(K key) { this->node = NULL; this->next = NULL; this->key = key; } void init(NodeType * node, K key) { this->node = node; this->next = NULL; this->key = key; } }; private: ReportItem * volatile * reportHeads; ReportItem * volatile * reportTails; NodeWrapper * volatile head; NodeWrapper * volatile tail; ReportItem * blocker; volatile bool active; // variables used for aggregating reports after they are collected void ** 
currLocations; int * currRepLocations; std::vector * volatile gAllReports; K KEY_MAX; K KEY_MIN; private: inline bool isBlocker(NodeWrapper const * const wrapper) { if (wrapper) { K key = wrapper->key; NodeType * node = wrapper->node; K key2 = KEY_MAX; return (key == key2 && node == NULL); } return false; } template inline void __retireAllReports(const int tid, std::vector * v, RecordManager * recmgr) { if (v == NULL) return; for (auto it = v->begin(); it != v->end(); it++) { // retire compact report items recmgr->retire(tid, *it); } } template inline void __deallocateAllReports(const int tid, std::vector * v, RecordManager * recmgr) { if (v == NULL) return; for (auto it = v->begin(); it != v->end(); it++) { // deallocate compact report items recmgr->deallocate(tid, *it); } delete v; } public: template void init(const int tid, const int numProcesses, RecordManager * const recmgr, const K _KEY_MIN, const K _KEY_MAX) { this->KEY_MIN = _KEY_MIN; this->KEY_MAX = _KEY_MAX; this->NUM_THREADS = numProcesses; this->reportHeads = new ReportItem * volatile [NUM_THREADS*PREFETCH_SIZE_WORDS]; this->reportTails = new ReportItem * volatile [NUM_THREADS*PREFETCH_SIZE_WORDS]; // head = new NodeWrapper(std::numeric_limits::min()) this->head = recmgr->template allocate(tid); #ifdef __HANDLE_STATS GSTATS_APPEND(tid, extra_type2_allocated_addresses, ((long long) head)%(1<<12)); #endif this->head->init(this->KEY_MIN); this->tail = this->head; // oldTail = NULL; // blocker = new ReportItem(NULL, ReportType::Add, -1) this->blocker = recmgr->template allocate(tid); #ifdef __HANDLE_STATS GSTATS_APPEND(tid, extra_type3_allocated_addresses, ((long long) blocker)%(1<<12)); #endif this->blocker->init(NULL, ReportType::Add, -1); this->active = true; this->currLocations = new void * [NUM_THREADS*PREFETCH_SIZE_WORDS]; this->currRepLocations = new int[NUM_THREADS*PREFETCH_SIZE_WORDS]; this->gAllReports = NULL; for (int i=0;ireportHeads[i*PREFETCH_SIZE_WORDS] = recmgr->template allocate(tid); #ifdef __HANDLE_STATS GSTATS_APPEND(tid, extra_type3_allocated_addresses, ((long long) reportHeads[i*PREFETCH_SIZE_WORDS])%(1<<12)); #endif this->reportHeads[i*PREFETCH_SIZE_WORDS]->init(NULL, ReportType::Add, -1); // sentinel head. this->reportTails[i*PREFETCH_SIZE_WORDS] = this->reportHeads[i*PREFETCH_SIZE_WORDS]; this->currLocations[i*PREFETCH_SIZE_WORDS] = NULL; this->currRepLocations[i*PREFETCH_SIZE_WORDS] = 0; } } ~SnapCollector() { if (reportHeads) delete[] reportHeads; if (reportTails) delete[] reportTails; if (currLocations) delete[] currLocations; if (currRepLocations) delete[] currRepLocations; if (gAllReports) delete gAllReports; } template void retire(const int tid, RecordManager * recmgr) { // retire report items for (int i=0;iretire(tid, curr); // blocker can exist in many per-thread lists, but we only want to retire it once, below. 
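// NOTE (added commentary): BlockFurtherReports() appends the SAME shared
// blocker sentinel to every per-thread report list, so each list traversal
// here must stop at (and not retire) the blocker; it is retired once, below.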
curr = curr->next; } } // retire blocker recmgr->retire(tid, blocker); // if a thread has changed tail to point to a "blocker," then // threads may have appended node wrappers to the blocker, // so we have to retire any such node wrappers NodeWrapper * curr = this->tail; if (isBlocker(curr)) { while (curr != NULL) { recmgr->retire(tid, curr); curr = curr->next; } } // retire node wrappers curr = head; while (curr != NULL) { // && curr != tail /*&& curr != oldTail*/) { recmgr->retire(tid, curr); curr = curr->next; } // retire the contents of gAllReports __retireAllReports(tid, gAllReports, recmgr); // retire snap collector recmgr->retire(tid, this); } // TIMNAT: Implemented according to the optimization in A.3: // TIMNAT: Only accept nodes whose key is higher than the last, and return the last node. template NodeType * AddNode(const int tid, NodeType * node, K key, RecordManager * recmgr) { NodeWrapper * last = tail; if (last->key >= key) // TIMNAT: trying to add an out of place node. return last->node; // advance tail pointer if needed if (last->next != NULL) { if (last == tail) __sync_bool_compare_and_swap(&tail, last, last->next); return tail->node; } NodeWrapper * newNode = recmgr->template allocate(tid); #ifdef __HANDLE_STATS GSTATS_APPEND(tid, extra_type2_allocated_addresses, ((long long) newNode)%(1<<12)); #endif newNode->init(node, key); if (__sync_bool_compare_and_swap(&last->next, NULL, newNode)) { __sync_bool_compare_and_swap(&tail, last, newNode); return node; } else { recmgr->deallocate(tid, newNode); return tail->node; } } template void Report(int tid, NodeType * Node, ReportType t, K key, RecordManager * recmgr) { ReportItem * reportTail = reportTails[tid*PREFETCH_SIZE_WORDS]; ReportItem * newItem = recmgr->template allocate(tid); #ifdef __HANDLE_STATS GSTATS_APPEND(tid, extra_type3_allocated_addresses, ((long long) newItem)%(1<<12)); #endif newItem->init(Node, t, key); if (__sync_bool_compare_and_swap(&reportTail->next, NULL, newItem)) { reportTails[tid*PREFETCH_SIZE_WORDS] = newItem; } else { recmgr->deallocate(tid, newItem); } } bool IsActive() { // __sync_synchronize(); // SOFTWARE_BARRIER; bool result = active; // SOFTWARE_BARRIER; return result; } template void BlockFurtherPointers(const int tid, RecordManager * recmgr) { NodeWrapper * blocker = recmgr->template allocate(tid); #ifdef __HANDLE_STATS GSTATS_APPEND(tid, extra_type2_allocated_addresses, ((long long) blocker)%(1<<12)); #endif blocker->init(NULL, KEY_MAX); #if 1 while (true) { NodeWrapper * old = this->tail; if (isBlocker(old)) { // old is a blocker, so no need to add our own blocker recmgr->deallocate(tid, blocker); return; } if (__sync_bool_compare_and_swap(&this->tail, old, blocker)) { return; } } #else tail = blocker; #endif } /** * note: the parameters are used for the timestamping mechanism of the * test harness. they are NOT inherently needed by the snap collector. */ void Deactivate(pthread_rwlock_t * const rwlock, volatile long long * timestamp, long long * rq_lin_time) { #ifdef RQ_USE_TIMESTAMPS if (pthread_rwlock_wrlock(rwlock)) error("could not write-lock rwlock"); active = false; // range query is linearized here *timestamp = *timestamp + 1; *rq_lin_time = *timestamp; //++(*timestamp); //std::cout<<"timestamp="<<*timestamp<next == NULL) __sync_bool_compare_and_swap(&reportTail->next, NULL, blocker); // assert cas succeeded OR reportTail->next == blocker } } private: // TIMNAT: What follows is functions that are used to work with the snapshot while it is // TIMNAT: already taken. 
These functions are used to iterate over the nodes of the snapshot. template void AddReports(const int tid, std::vector * allReports, ReportItem * curr, RecordManager * recmgr) { curr = curr->next; while (curr != NULL && curr != blocker) { CompactReportItem * newItem = recmgr->template allocate(tid); #ifdef __HANDLE_STATS GSTATS_APPEND(tid, extra_type4_allocated_addresses, ((long long) newItem)%(1<<12)); #endif newItem->init(curr->node, curr->t, curr->key); allReports->push_back(newItem); curr = curr->next; } } public: // An optimization: sort the reports and nodes. template void Prepare(int tid, RecordManager * recmgr) { currLocations[tid*PREFETCH_SIZE_WORDS] = head; currRepLocations[tid*PREFETCH_SIZE_WORDS] = 0; if (gAllReports != NULL) return; std::vector * allReports = new std::vector(); for (int i = 0; i < NUM_THREADS; i++) { AddReports(tid, allReports, reportHeads[i*PREFETCH_SIZE_WORDS], recmgr); if (gAllReports != NULL) { // failed to publish allReports -- clean it up __deallocateAllReports(tid, allReports, recmgr); return; } } assert(!active); #ifdef SNAPCOLLECTOR_PRINT_RQS std::cout<<"this="<<(long long) this<<" allReports size="<size()<begin(), allReports->end(), compareCRI); if (__sync_bool_compare_and_swap(&gAllReports, NULL, allReports)) { // published allReports } else { // failed to publish allReports -- clean it up __deallocateAllReports(tid, allReports, recmgr); } } NodeType * GetNext(int tid) { NodeWrapper * currLoc = (NodeWrapper *) currLocations[tid*PREFETCH_SIZE_WORDS]; int currRepLoc = currRepLocations[tid*PREFETCH_SIZE_WORDS]; std::vector * allReports = gAllReports; while (true) { CompactReportItem * rep = NULL; K repKey = KEY_MAX; if (allReports->size() > currRepLoc) { rep = (*allReports)[currRepLoc]; repKey = rep->key; } K nodeKey = KEY_MAX; NodeWrapper * next = currLoc->next; if (next != NULL) { nodeKey = next->key; } // Option 1: node key < rep key. Return node. if (nodeKey < repKey) { currLocations[tid*PREFETCH_SIZE_WORDS] = next; currRepLocations[tid*PREFETCH_SIZE_WORDS] = currRepLoc; return next->node; } // Option 2: node key == rep key if (nodeKey == repKey) { // 2.a - both are infinity - iteration done. if (nodeKey == KEY_MAX) { currLocations[tid*PREFETCH_SIZE_WORDS] = currLoc; currRepLocations[tid*PREFETCH_SIZE_WORDS] = currRepLoc; return NULL; } // node and report with the same key :: // skip not-needed reports while (currRepLoc + 1 < allReports->size()) { CompactReportItem * nextRep = (*allReports)[currRepLoc + 1]; // dismiss a duplicate, or an insert followed by a matching delete: if (rep->key == nextRep->key && rep->node == nextRep->node) { currRepLoc++; rep = nextRep; } else { break; } } // standing on an insert report to a node I am holding: // 1. Return the current node. // 2. Skip over rest of reports for that key. if (rep->t == ReportType::Add && (NodeType *) rep->node == next->node) { while (currRepLoc < allReports->size() && (*allReports)[currRepLoc]->key == rep->key) { currRepLoc++; } currRepLocations[tid*PREFETCH_SIZE_WORDS] = currRepLoc; currLocations[tid*PREFETCH_SIZE_WORDS] = next; return next->node; } // standing on an insert report to a different node than I hold: // 1. Return the reported node. // 2. Skip over rest of reports for that key. 
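// Illustrative example (added; keys are hypothetical): suppose the collected
// node list holds keys {3, 7} and the sorted reports are
// {<Add,5>, <Remove,7>} where the Remove refers to the collected node 7.
// Successive GetNext calls return the node with key 3 (Option 1), then the
// node carried by <Add,5> (Option 3, insert report), then skip key 7 (delete
// report for the held node) and return NULL once both sides reach KEY_MAX.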
if (rep->t == ReportType::Add && (NodeType *) rep->node != next->node) { NodeType * returnValue = (NodeType *) rep->node; while (currRepLoc < allReports->size() && (*allReports)[currRepLoc]->key == rep->key) { currRepLoc++; } currRepLocations[tid*PREFETCH_SIZE_WORDS] = currRepLoc; currLocations[tid*PREFETCH_SIZE_WORDS] = next; return returnValue; } // standing on a delete report to a different node than I hold: // skip over it and continue the big loop. if (rep->t == ReportType::Remove && (NodeType *) rep->node != next->node) { currRepLoc++; continue; } // standing on a delete report to the node that I hold: // 1. advance over the node that I hold. // 2. advance with the report. // 3. continue the bigloop currLoc = next; currRepLoc++; continue; } // Option 3: node key > rep key if (nodeKey > repKey) { // skip not-needed reports while (currRepLoc + 1 < allReports->size()) { CompactReportItem * nextRep = (*allReports)[currRepLoc + 1]; // dismiss a duplicate, or an insert followed by a matching delete: if (rep->key == nextRep->key && rep->node == nextRep->node) { currRepLoc++; rep = nextRep; } else { break; } } // a delete report - skip over it. if (rep->t == ReportType::Remove) { currRepLoc++; continue; } // an insert report: // 1. skip over rest of the reports for the same key. // 2. return the node. if (rep->t == ReportType::Add) { NodeType * returnValue = (NodeType *) rep->node; while (currRepLoc < allReports->size() && (*allReports)[currRepLoc]->key == rep->key) { currRepLoc++; } currRepLocations[tid*PREFETCH_SIZE_WORDS] = currRepLoc; currLocations[tid*PREFETCH_SIZE_WORDS] = currLoc; return returnValue; } } } } }; #endif /* SNAPCOLLECTOR_H */ ================================================ FILE: datastructures/trevor_brown_abtree/common/rq/snapcollector/snapcollector_test.cpp ================================================ /* * File: test.cpp * Author: trbot * * Created on June 21, 2017, 5:25 PM */ #include #include #include #include "snapcollector.h" #include "rq_snapcollector.h" using namespace std; class Node { public: int key; volatile bool marked; volatile long long itime; volatile long long dtime; Node(int key) : key(key) {} }; class DataStructure { public: inline bool isLogicallyDeleted(const int tid, Node * node) { return node->marked; } inline int getKeys(const int tid, Node * node, int * const outputKeys) { outputKeys[0] = node->key; return 1; } bool isInRange(const int& key, const int& lo, const int& hi) { return lo <= key && key <= hi; } }; /* * */ int main(int argc, char** argv) { DataStructure ds; Node node (17); const int numProcessors = 1; SnapCollector sc (numProcessors); sc.AddNode(&node, node.key); RQProvider prov (numProcessors, &ds); Node * inserted[] = {NULL}; Node * deleted[] = {NULL}; prov.linearize_update_at_cas(1, &node.key, 17, 18, inserted, deleted, (void *) NULL); return 0; } ================================================ FILE: datastructures/trevor_brown_abtree/common/rwlock.h ================================================ /* * File: rwlock.h * Author: trbot * * Created on June 29, 2017, 8:25 PM */ #ifndef RWLOCK_H #define RWLOCK_H #ifdef RWLOCK_PTHREADS #elif defined RWLOCK_FAVOR_WRITERS #elif defined RWLOCK_FAVOR_READERS #else // #warning "No RWLOCK implementation specified... using default: favour READERS. See rwlock.h for options. Note that this setting only affects algorithms that use the lock-based range query provider in common/rq/rq_rwlock.h." 
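// For reference, a reader-preference lock can be sketched with a single
// atomic counter (illustrative only -- NOT the RWLOCK_FAVOR_READERS
// implementation selected below; SimpleRWLock is a hypothetical name):
//
//   #include <atomic>
//   class SimpleRWLock {
//       std::atomic<int> state {0};  // -1: writer holds lock, >=0: reader count
//   public:
//       void readLock() {            // readers barge ahead of waiting writers
//           while (true) {
//               int s = state.load();
//               if (s >= 0 && state.compare_exchange_weak(s, s + 1)) return;
//           }
//       }
//       void readUnlock()  { state.fetch_sub(1); }
//       void writeLock()   { int e = 0; while (!state.compare_exchange_weak(e, -1)) e = 0; }
//       void writeUnlock() { state.store(0); }
//   };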
#define RWLOCK_FAVOR_READERS // #error Must specify RWLOCK implementation; see rwlock.h #endif #ifdef RWLOCK_PTHREADS class RWLock { private: pthread_rwlock_t lock; public: RWLock() { if (pthread_rwlock_init(&lock, NULL)) error("could not init rwlock"); } ~RWLock() { if (pthread_rwlock_destroy(&lock)) error("could not destroy rwlock"); } inline void readLock() { if (pthread_rwlock_rdlock(&lock)) error("could not read-lock rwlock"); } inline void readUnlock() { if (pthread_rwlock_unlock(&lock)) error("could not read-unlock rwlock"); } inline void writeLock() { if (pthread_rwlock_wrlock(&lock)) error("could not write-lock rwlock"); } inline void writeUnlock() { if (pthread_rwlock_unlock(&lock)) error("could not write-unlock rwlock"); } inline bool isWriteLocked() { cout<<"ERROR: isWriteLocked() is not implemented"<. */ #ifndef ABTREE_H #define ABTREE_H #include #include #include #include #include #include #include #include #include "record_manager.h" #include "descriptors.h" #include "rq_provider.h" namespace abtree_ns { #ifndef TRACE #define TRACE if(0) #endif #ifndef DEBUG #define DEBUG if(0) #endif #ifndef DEBUG1 #define DEBUG1 if(0) #endif #ifndef DEBUG2 #define DEBUG2 if(0) #endif #define ABTREE_ENABLE_DESTRUCTOR using namespace std; template struct Node; template struct SCXRecord; template class wrapper_info { public: const static int MAX_NODES = DEGREE+2; Node * nodes[MAX_NODES]; SCXRecord * scxPtrs[MAX_NODES]; Node * newNode; Node * volatile * field; int state; char numberOfNodes; char numberOfNodesToFreeze; char numberOfNodesAllocated; // for rqProvider Node * insertedNodes[MAX_NODES+1]; Node * deletedNodes[MAX_NODES+1]; }; template struct SCXRecord { const static int STATE_INPROGRESS = 0; const static int STATE_COMMITTED = 1; const static int STATE_ABORTED = 2; union { struct { volatile mutables_t mutables; int numberOfNodes; int numberOfNodesToFreeze; Node * newNode; Node * volatile * field; Node * nodes[wrapper_info::MAX_NODES]; // array of pointers to nodes SCXRecord * scxPtrsSeen[wrapper_info::MAX_NODES]; // array of pointers to scx records // for rqProvider Node * insertedNodes[wrapper_info::MAX_NODES+1]; Node * deletedNodes[wrapper_info::MAX_NODES+1]; } __attribute__((packed)) c; // WARNING: be careful with atomicity because of packed attribute!!! (this means no atomic vars smaller than word size, and all atomic vars must start on a word boundary when fields are packed tightly) char bytes[2*PREFETCH_SIZE_BYTES]; }; const static int size = sizeof(c); }; template struct Node { SCXRecord * volatile scxPtr; int leaf; // 0 or 1 volatile int marked; // 0 or 1 int weight; // 0 or 1 int size; // degree of node K searchKey; #if defined(RQ_LOCKFREE) || defined(RQ_RWLOCK) || defined(HTM_RQ_RWLOCK) volatile long long itime; // for use by range query algorithm volatile long long dtime; // for use by range query algorithm #endif K keys[DEGREE]; Node * volatile ptrs[DEGREE]; inline bool isLeaf() { return leaf; } inline int getKeyCount() { return isLeaf() ? 
size : size-1; } inline int getABDegree() { return size; } template inline int getChildIndex(const K& key, Compare cmp) { int nkeys = getKeyCount(); int retval = 0; while (retval < nkeys && !cmp(key, (const K&) keys[retval])) { ++retval; } return retval; } template inline int getKeyIndex(const K& key, Compare cmp) { int nkeys = getKeyCount(); int retval = 0; while (retval < nkeys && cmp((const K&) keys[retval], key)) { ++retval; } return retval; } }; template class abtree { // the following bool determines whether the optimization to guarantee // amortized constant rebalancing (at the cost of decreasing average degree // by at most one) is used. // if it is false, then an amortized logarithmic number of rebalancing steps // may be performed per operation, but average degree increases slightly. char padding0[PREFETCH_SIZE_BYTES]; const bool ALLOW_ONE_EXTRA_SLACK_PER_NODE; const int b; const int a; RecManager * const recordmgr; RQProvider, abtree, RecManager, false, false> * const rqProvider; char padding1[PREFETCH_SIZE_BYTES]; Compare cmp; // descriptor reduction algorithm #ifndef comma #define comma , #endif #define DESC1_ARRAY records #define DESC1_T SCXRecord #define MUTABLES1_OFFSET_ALLFROZEN 0 #define MUTABLES1_OFFSET_STATE 1 #define MUTABLES1_MASK_ALLFROZEN 0x1 #define MUTABLES1_MASK_STATE 0x6 #define MUTABLES1_NEW(mutables) \ ((((mutables)&MASK1_SEQ)+(1<::STATE_INPROGRESS<::STATE_COMMITTED< * entry; char padding3[PREFETCH_SIZE_BYTES]; #define DUMMY ((SCXRecord*) (void*) TAGPTR1_STATIC_DESC(0)) #define FINALIZED ((SCXRecord*) (void*) TAGPTR1_DUMMY_DESC(1)) #define FAILED ((SCXRecord*) (void*) TAGPTR1_DUMMY_DESC(2)) #define arraycopy(src, srcStart, dest, destStart, len) \ for (int ___i=0;___i<(len);++___i) { \ (dest)[(destStart)+___i] = (src)[(srcStart)+___i]; \ } #define arraycopy_ptrs(src, srcStart, dest, destStart, len) \ for (int ___i=0;___i<(len);++___i) { \ rqProvider->write_addr(tid, &(dest)[(destStart)+___i], \ rqProvider->read_addr(tid, &(src)[(srcStart)+___i])); \ } private: void* doInsert(const int tid, const K& key, void * const value, const bool replace); // returns true if the invocation of this method // (and not another invocation of a method performed by this method) // performed an scx, and false otherwise bool fixWeightViolation(const int tid, Node* viol); // returns true if the invocation of this method // (and not another invocation of a method performed by this method) // performed an scx, and false otherwise bool fixDegreeViolation(const int tid, Node* viol); bool llx(const int tid, Node* r, Node ** snapshot, const int i, SCXRecord ** ops, Node ** nodes); SCXRecord* llx(const int tid, Node* r, Node ** snapshot); bool scx(const int tid, wrapper_info * info); void helpOther(const int tid, tagptr_t tagptr); int help(const int tid, const tagptr_t tagptr, SCXRecord const * const snap, const bool helpingOther); SCXRecord* createSCXRecord(const int tid, wrapper_info * info); Node* allocateNode(const int tid); void freeSubtree(Node* node, int* nodes) { const int tid = 0; if (node == NULL) return; if (!node->isLeaf()) { for (int i=0;igetABDegree();++i) { freeSubtree(node->ptrs[i], nodes); } } ++(*nodes); recordmgr->retire(tid, node); } int init[MAX_TID_POW2] = {0,}; public: void * const NO_VALUE; const int NUM_PROCESSES; #ifdef USE_DEBUGCOUNTERS debugCounters * const counters; // debug info #endif /** * This function must be called once by each thread that will * invoke any functions on this class. 
* * It must be okay that we do this with the main thread and later with another thread! */ void initThread(const int tid) { if (init[tid]) return; else init[tid] = !init[tid]; recordmgr->initThread(tid); rqProvider->initThread(tid); } void deinitThread(const int tid) { if (!init[tid]) return; else init[tid] = !init[tid]; rqProvider->deinitThread(tid); recordmgr->deinitThread(tid); } /** * Creates a new relaxed (a,b)-tree wherein:
* each internal node has up to DEGREE child pointers, and
* each leaf has up to DEGREE key/value pairs, and
* keys are ordered according to the provided comparator. */ abtree(const int numProcesses, const K anyKey, int suspectedCrashSignal = SIGQUIT) : ALLOW_ONE_EXTRA_SLACK_PER_NODE(true) , b(DEGREE) , a(DEGREE/2 - 2) , recordmgr(new RecManager(numProcesses, suspectedCrashSignal)) , rqProvider(new RQProvider, abtree, RecManager, false, false>(numProcesses, this, recordmgr)) , NO_VALUE((void *) -1LL) , NUM_PROCESSES(numProcesses) { cmp = Compare(); const int tid = 0; initThread(tid); recordmgr->enterQuiescentState(tid); DESC1_INIT_ALL(numProcesses); SCXRecord *dummy = TAGPTR1_UNPACK_PTR(DUMMY); dummy->c.mutables = MUTABLES1_INIT_DUMMY; TRACE COUTATOMICTID("DUMMY mutables="<c.mutables<* _entryLeft = allocateNode(tid); _entryLeft->scxPtr = DUMMY; _entryLeft->leaf = true; _entryLeft->marked = false; _entryLeft->weight = true; _entryLeft->size = 0; _entryLeft->searchKey = anyKey; Node* _entry = allocateNode(tid); _entry = allocateNode(tid); _entry->scxPtr = DUMMY; _entry->leaf = false; _entry->marked = false; _entry->weight = true; _entry->size = 1; _entry->searchKey = anyKey; _entry->ptrs[0] = _entryLeft; // need to simulate real insertion of root and the root's child, // since range queries will actually try to add these nodes, // and we don't want blocking rq providers to spin forever // waiting for their itimes to be set to a positive number. Node* insertedNodes[] = {_entry, _entryLeft, NULL}; Node* deletedNodes[] = {NULL}; rqProvider->linearize_update_at_write(tid, &entry, _entry, insertedNodes, deletedNodes); } #ifdef ABTREE_ENABLE_DESTRUCTOR ~abtree() { int nodes = 0; freeSubtree(entry, &nodes); // COUTATOMIC("main thread: deleted tree containing "<printStatus(); delete recordmgr; } #endif Node * debug_getEntryPoint() { return entry; } private: /******************************************************************* * Utility functions for integration with the test harness *******************************************************************/ int sequentialSize(Node* node) { if (node->isLeaf()) { return node->getKeyCount(); } int retval = 0; for (int i=0;igetABDegree();++i) { Node* child = node->ptrs[i]; retval += sequentialSize(child); } return retval; } int sequentialSize() { return sequentialSize(entry->ptrs[0]); } int getNumberOfLeaves(Node* node) { if (node == NULL) return 0; if (node->isLeaf()) return 1; int result = 0; for (int i=0;igetABDegree();++i) { result += getNumberOfLeaves(node->ptrs[i]); } return result; } const int getNumberOfLeaves() { return getNumberOfLeaves(entry->ptrs[0]); } int getNumberOfInternals(Node* node) { if (node == NULL) return 0; if (node->isLeaf()) return 0; int result = 1; for (int i=0;igetABDegree();++i) { result += getNumberOfInternals(node->ptrs[i]); } return result; } const int getNumberOfInternals() { return getNumberOfInternals(entry->ptrs[0]); } const int getNumberOfNodes() { return getNumberOfLeaves() + getNumberOfInternals(); } int getSumOfKeyDepths(Node* node, int depth) { if (node == NULL) return 0; if (node->isLeaf()) return depth * node->getKeyCount(); int result = 0; for (int i=0;igetABDegree();i++) { result += getSumOfKeyDepths(node->ptrs[i], 1+depth); } return result; } const int getSumOfKeyDepths() { return getSumOfKeyDepths(entry->ptrs[0], 0); } const double getAverageKeyDepth() { long sz = sequentialSize(); return (sz == 0) ? 
0 : getSumOfKeyDepths() / sz; } int getHeight(Node* node, int depth) { if (node == NULL) return 0; if (node->isLeaf()) return 0; int result = 0; for (int i=0;igetABDegree();i++) { int retval = getHeight(node->ptrs[i], 1+depth); if (retval > result) result = retval; } return result+1; } const int getHeight() { return getHeight(entry->ptrs[0], 0); } int getKeyCount(Node* entry) { if (entry == NULL) return 0; if (entry->isLeaf()) return entry->getKeyCount(); int sum = 0; for (int i=0;igetABDegree();++i) { sum += getKeyCount(entry->ptrs[i]); } return sum; } int getTotalDegree(Node* entry) { if (entry == NULL) return 0; int sum = entry->getKeyCount(); if (entry->isLeaf()) return sum; for (int i=0;igetABDegree();++i) { sum += getTotalDegree(entry->ptrs[i]); } return 1+sum; // one more children than keys } int getNodeCount(Node* entry) { if (entry == NULL) return 0; if (entry->isLeaf()) return 1; int sum = 1; for (int i=0;igetABDegree();++i) { sum += getNodeCount(entry->ptrs[i]); } return sum; } double getAverageDegree() { return getTotalDegree(entry) / (double) getNodeCount(entry); } double getSpacePerKey() { return getNodeCount(entry)*2*b / (double) getKeyCount(entry); } long long getSumOfKeys(Node* node) { TRACE COUTATOMIC(" getSumOfKeys("< * node) { return false; } inline int getKeys(const int tid, Node * node, K * const outputKeys, void ** const outputValues) { if (node->isLeaf()) { // leaf ==> its keys are in the set. const int sz = node->getKeyCount(); for (int i=0;ikeys[i]; outputValues[i] = (void *) node->ptrs[i]; } return sz; } // note: internal ==> its keys are NOT in the set return 0; } bool isInRange(const K& key, const K& lo, const K& hi) { return (!cmp(key, lo) && !cmp(hi, key)); } /** * END FUNCTIONS FOR RANGE QUERY SUPPORT */ long long getSizeInNodes() { return getNumberOfNodes(); } string getSizeString() { stringstream ss; int preallocated = wrapper_info::MAX_NODES * recordmgr->NUM_PROCESSES; ss< * node) { return sequentialSize(node); } long long getSize() { return sequentialSize(); } RecManager * const debugGetRecMgr() { return recordmgr; } long long debugKeySum() { return getSumOfKeys(); } }; } // namespace #endif /* ABTREE_H */ ================================================ FILE: datastructures/trevor_brown_abtree/ds/brown_ext_abtree_lf/brown_ext_abtree_lf_adapter.h ================================================ /* * File: bst_adapter.h * Author: trbot * * Created on August 31, 2017, 6:53 PM */ #ifndef BST_ADAPTER_H #define BST_ADAPTER_H #include #include "brown_ext_abtree_lf_impl.h" #include "errors.h" using namespace abtree_ns; #define RECORD_MANAGER_T record_manager> #define DATA_STRUCTURE_T abtree, RECORD_MANAGER_T> template , class Alloc = allocator_new, class Pool = pool_none> class ds_adapter { private: const void * NO_VALUE; DATA_STRUCTURE_T * const ds; public: ds_adapter(const int numThreads, const K ANY_KEY) : ds(new DATA_STRUCTURE_T(numThreads, ANY_KEY)) {} ~ds_adapter() { delete ds; } void * getNoValue() { return ds->NO_VALUE; } void initThread(const int tid) { ds->initThread(tid); } void deinitThread(const int tid) { ds->deinitThread(tid); } bool contains(const int tid, const K& key) { return ds->contains(tid, key); } void * const insert(const int tid, const K& key, void * const val) { return ds->insert(tid, key, val); } void * const insertIfAbsent(const int tid, const K& key, void * const val) { return ds->insertIfAbsent(tid, key, val); } void * const erase(const int tid, const K& key) { return ds->erase(tid, key).first; } void * find(const int tid, const K& 
key) { return ds->find(tid, key).first; } int rangeQuery(const int tid, const K& lo, const K& hi, K * const resultKeys, void ** const resultValues) { return ds->rangeQuery(tid, lo, hi, resultKeys, resultValues); } /** * Sequential operation to get the number of keys in the set */ int getSize() { return ds->getSize(); } void printSummary() { stringstream ss; ss<getSizeInNodes()<<" nodes in tree"; cout<debugGetRecMgr(); recmgr->printStatus(); } long long getKeyChecksum() { return ds->debugKeySum(); } bool validateStructure() { return true; } void printObjectSizes() { std::cout<<"sizes: node=" <<(sizeof(Node)) <<" descriptor="<<(sizeof(SCXRecord))<<" (statically allocated)" <. */ /** * Implementation note: * The ptrs arrays of internal nodes may be modified by calls to * rqProvider->linearize_update_at_cas or ->linearize_update_at_write. * Consequently, we must access access entries in the ptrs arrays of INTERNAL * nodes by performing calls to read_addr and write_addr (and linearize_...). * * However, the ptrs arrays of leaves represent fundamentally different data: * specifically values, or pointers to values, and NOT pointers to nodes. * Thus, the ptrs arrays of leaves CANNOT be modified by such calls. * So, we do NOT use these functions to access entries in leaves' ptrs arrays. */ #ifndef ABTREE_IMPL_H #define ABTREE_IMPL_H #include "brown_ext_abtree_lf.h" #define eassert(x, y) if ((x) != (y)) { cout<<"ERROR: "<<#x<<" != "<<#y<<" :: "<<#x<<"="<init_node(tid, newnode); #ifdef __HANDLE_STATS GSTATS_APPEND(tid, node_allocated_addresses, ((long long) newnode)%(1<<12)); #endif return newnode; } /** * Returns the value associated with key, or NULL if key is not present. */ template const pair abtree_ns::abtree::find(const int tid, const K& key) { pair result; this->recordmgr->leaveQuiescentState(tid); Node * l = rqProvider->read_addr(tid, &entry->ptrs[0]); while (!l->isLeaf()) { int ix = l->getChildIndex(key, cmp); l = rqProvider->read_addr(tid, &l->ptrs[ix]); } int index = l->getKeyIndex(key, cmp); if (index < l->getKeyCount() && l->keys[index] == key) { result.first = l->ptrs[index]; // this is a value, not a pointer, so it cannot be modified by rqProvider->linearize_update_at_..., so we do not use read_addr result.second = true; } else { result.first = NO_VALUE; result.second = false; } this->recordmgr->enterQuiescentState(tid); return result; } template bool abtree_ns::abtree::contains(const int tid, const K& key) { return find(tid, key).second; } template int abtree_ns::abtree::rangeQuery(const int tid, const K& lo, const K& hi, K * const resultKeys, void ** const resultValues) { block> stack (NULL); recordmgr->leaveQuiescentState(tid); rqProvider->traversal_start(tid); // depth first traversal (of interesting subtrees) int size = 0; TRACE COUTATOMICTID("rangeQuery(lo="< * node = stack.pop(); assert(node); // if leaf node, check if we should add its keys to the traversal if (node->isLeaf()) { rqProvider->traversal_try_add(tid, node, resultKeys, resultValues, &size, lo, hi); // else if internal node, explore its children } else { // find right-most sub-tree that could contain a key in [lo, hi] int nkeys = node->getKeyCount(); int r = nkeys; while (r > 0 && cmp(hi, (const K&) node->keys[r-1])) --r; // subtree rooted at node->ptrs[r] contains only keys > hi // find left-most sub-tree that could contain a key in [lo, hi] int l = 0; while (l < nkeys && !cmp(lo, (const K&) node->keys[l])) ++l; // subtree rooted at node->ptrs[l] contains only keys < lo // perform DFS from left to right (so push 
onto stack from right to left) for (int i=r;i>=l; --i) stack.push(rqProvider->read_addr(tid, &node->ptrs[i])); // // simply explore EVERYTHING // for (int i=0;igetABDegree();++i) { // stack.push(rqProvider->read_addr(tid, &node->ptrs[i])); // } } } // success rqProvider->traversal_end(tid, resultKeys, resultValues, &size, lo, hi); recordmgr->enterQuiescentState(tid); return size; } template void* abtree_ns::abtree::doInsert(const int tid, const K& key, void * const value, const bool replace) { wrapper_info _info; wrapper_info* info = &_info; while (true) { /** * search */ this->recordmgr->leaveQuiescentState(tid); Node* gp = NULL; Node* p = entry; Node* l = rqProvider->read_addr(tid, &p->ptrs[0]); int ixToP = -1; int ixToL = 0; while (!l->isLeaf()) { ixToP = ixToL; ixToL = l->getChildIndex(key, cmp); gp = p; p = l; l = rqProvider->read_addr(tid, &l->ptrs[ixToL]); } /** * do the update */ int keyIndex = l->getKeyIndex(key, cmp); if (keyIndex < l->getKeyCount() && l->keys[keyIndex] == key) { /** * if l already contains key, replace the existing value */ void* const oldValue = l->ptrs[keyIndex]; // this is a value, not a pointer, so it cannot be modified by rqProvider->linearize_update_at_..., so we do not use read_addr if (!replace) { this->recordmgr->enterQuiescentState(tid); return oldValue; } // perform LLXs if (!llx(tid, p, NULL, 0, info->scxPtrs, info->nodes) || rqProvider->read_addr(tid, &p->ptrs[ixToL]) != l) { this->recordmgr->enterQuiescentState(tid); continue; // retry the search } info->nodes[1] = l; // create new node(s) Node* n = allocateNode(tid); arraycopy(l->keys, 0, n->keys, 0, l->getKeyCount()); arraycopy(l->ptrs, 0, n->ptrs, 0, l->getABDegree()); // although we are copying l->ptrs, since l is a leaf, l->ptrs CANNOT contain modified by rqProvider->linearize_update_at_..., so we do not use arraycopy_ptrs. n->ptrs[keyIndex] = (Node*) value; // similarly, we don't use write_addr here n->leaf = true; n->marked = false; n->scxPtr = DUMMY; n->searchKey = l->searchKey; n->size = l->size; n->weight = true; // construct info record to pass to SCX info->numberOfNodes = 2; info->numberOfNodesAllocated = 1; info->numberOfNodesToFreeze = 1; info->field = &p->ptrs[ixToL]; info->newNode = n; info->insertedNodes[0] = n; info->insertedNodes[1] = NULL; info->deletedNodes[0] = l; info->deletedNodes[1] = NULL; if (scx(tid, info)) { TRACE COUTATOMICTID("replace pair ("<recordmgr->enterQuiescentState(tid); return oldValue; } TRACE COUTATOMICTID("replace pair ("<recordmgr->enterQuiescentState(tid); this->recordmgr->deallocate(tid, n); } else { /** * if l does not contain key, we have to insert it */ // perform LLXs if (!llx(tid, p, NULL, 0, info->scxPtrs, info->nodes) || rqProvider->read_addr(tid, &p->ptrs[ixToL]) != l) { this->recordmgr->enterQuiescentState(tid); continue; // retry the search } info->nodes[1] = l; if (l->getKeyCount() < b) { /** * Insert pair */ // create new node(s) Node* n = allocateNode(tid); arraycopy(l->keys, 0, n->keys, 0, keyIndex); arraycopy(l->keys, keyIndex, n->keys, keyIndex+1, l->getKeyCount()-keyIndex); n->keys[keyIndex] = key; arraycopy(l->ptrs, 0, n->ptrs, 0, keyIndex); // although we are copying the ptrs array, since the source node is a leaf, ptrs CANNOT contain modified by rqProvider->linearize_update_at_..., so we do not use arraycopy_ptrs. 
arraycopy(l->ptrs, keyIndex, n->ptrs, keyIndex+1, l->getABDegree()-keyIndex); n->ptrs[keyIndex] = (Node*) value; // similarly, we don't use write_addr here n->leaf = l->leaf; n->marked = false; n->scxPtr = DUMMY; n->searchKey = l->searchKey; n->size = l->size+1; n->weight = l->weight; // construct info record to pass to SCX info->numberOfNodes = 2; info->numberOfNodesAllocated = 1; info->numberOfNodesToFreeze = 1; info->field = &p->ptrs[ixToL]; info->newNode = n; info->insertedNodes[0] = n; info->insertedNodes[1] = NULL; info->deletedNodes[0] = l; info->deletedNodes[1] = NULL; if (scx(tid, info)) { TRACE COUTATOMICTID("insert pair ("<recordmgr->enterQuiescentState(tid); return NO_VALUE; } TRACE COUTATOMICTID("insert pair ("<recordmgr->enterQuiescentState(tid); this->recordmgr->deallocate(tid, n); } else { // assert: l->getKeyCount() == DEGREE == b) /** * Overflow */ // first, we create a pair of large arrays // containing too many keys and pointers to fit in a single node K keys[DEGREE+1]; Node* ptrs[DEGREE+1]; arraycopy(l->keys, 0, keys, 0, keyIndex); arraycopy(l->keys, keyIndex, keys, keyIndex+1, l->getKeyCount()-keyIndex); keys[keyIndex] = key; arraycopy(l->ptrs, 0, ptrs, 0, keyIndex); // although we are copying the ptrs array, since the source node is a leaf, ptrs CANNOT contain modified by rqProvider->linearize_update_at_..., so we do not use arraycopy_ptrs. arraycopy(l->ptrs, keyIndex, ptrs, keyIndex+1, l->getABDegree()-keyIndex); ptrs[keyIndex] = (Node*) value; // create new node(s): // since the new arrays are too big to fit in a single node, // we replace l by a new subtree containing three new nodes: // a parent, and two leaves; // the array contents are then split between the two new leaves const int size1 = (DEGREE+1)/2; Node* left = allocateNode(tid); arraycopy(keys, 0, left->keys, 0, size1); arraycopy(ptrs, 0, left->ptrs, 0, size1); // although we are copying the ptrs array, since the node is a leaf, ptrs CANNOT contain modified by rqProvider->linearize_update_at_..., so we do not use arraycopy_ptrs. left->leaf = true; left->marked = false; left->scxPtr = DUMMY; left->searchKey = keys[0]; left->size = size1; left->weight = true; const int size2 = (DEGREE+1) - size1; Node* right = allocateNode(tid); arraycopy(keys, size1, right->keys, 0, size2); arraycopy(ptrs, size1, right->ptrs, 0, size2); // although we are copying the ptrs array, since the node is a leaf, ptrs CANNOT contain modified by rqProvider->linearize_update_at_..., so we do not use arraycopy_ptrs. 
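// NOTE (added commentary): the DEGREE+1 key/value pairs are divided so that
// 'left' receives size1 = (DEGREE+1)/2 pairs and 'right' receives the
// remaining size2 = (DEGREE+1) - size1; e.g., DEGREE = 16 gives 8 and 9.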
right->leaf = true; right->marked = false; right->scxPtr = DUMMY; right->searchKey = keys[size1]; right->size = size2; right->weight = true; Node* n = allocateNode(tid); n->keys[0] = keys[size1]; rqProvider->write_addr(tid, &n->ptrs[0], left); rqProvider->write_addr(tid, &n->ptrs[1], right); n->leaf = false; n->marked = false; n->scxPtr = DUMMY; n->searchKey = keys[size1]; n->size = 2; n->weight = p == entry; // note: weight of new internal node n will be zero, // unless it is the root; this is because we test // p == entry, above; in doing this, we are actually // performing Root-Zero at the same time as this Overflow // if n will become the root (of the B-slack tree) // construct info record to pass to SCX info->numberOfNodes = 2; info->numberOfNodesAllocated = 3; info->numberOfNodesToFreeze = 1; info->field = &p->ptrs[ixToL]; info->newNode = n; info->insertedNodes[0] = n; info->insertedNodes[1] = left; info->insertedNodes[2] = right; info->insertedNodes[3] = NULL; info->deletedNodes[0] = l; info->deletedNodes[1] = NULL; if (scx(tid, info)) { TRACE COUTATOMICTID("insert overflow ("<recordmgr->enterQuiescentState(tid); return NO_VALUE; } TRACE COUTATOMICTID("insert overflow ("<recordmgr->enterQuiescentState(tid); this->recordmgr->deallocate(tid, n); this->recordmgr->deallocate(tid, left); this->recordmgr->deallocate(tid, right); } } } } template const pair abtree_ns::abtree::erase(const int tid, const K& key) { wrapper_info _info; wrapper_info* info = &_info; while (true) { /** * search */ this->recordmgr->leaveQuiescentState(tid); Node* gp = NULL; Node* p = entry; Node* l = rqProvider->read_addr(tid, &p->ptrs[0]); int ixToP = -1; int ixToL = 0; while (!l->isLeaf()) { ixToP = ixToL; ixToL = l->getChildIndex(key, cmp); gp = p; p = l; l = rqProvider->read_addr(tid, &l->ptrs[ixToL]); } /** * do the update */ const int keyIndex = l->getKeyIndex(key, cmp); if (keyIndex == l->getKeyCount() || l->keys[keyIndex] != key) { /** * if l does not contain key, we are done. */ this->recordmgr->enterQuiescentState(tid); return pair(NO_VALUE,false); } else { /** * if l contains key, replace l by a new copy that does not contain key. */ // perform LLXs if (!llx(tid, p, NULL, 0, info->scxPtrs, info->nodes) || rqProvider->read_addr(tid, &p->ptrs[ixToL]) != l) { this->recordmgr->enterQuiescentState(tid); continue; // retry the search } info->nodes[1] = l; // create new node(s) Node* n = allocateNode(tid); //printf("keyIndex=%d getABDegree-keyIndex=%d\n", keyIndex, l->getABDegree()-keyIndex); arraycopy(l->keys, 0, n->keys, 0, keyIndex); arraycopy(l->keys, keyIndex+1, n->keys, keyIndex, l->getKeyCount()-(keyIndex+1)); arraycopy(l->ptrs, 0, n->ptrs, 0, keyIndex); // although we are copying the ptrs array, since the node is a leaf, ptrs CANNOT contain modified by rqProvider->linearize_update_at_..., so we do not use arraycopy_ptrs. arraycopy(l->ptrs, keyIndex+1, n->ptrs, keyIndex, l->getABDegree()-(keyIndex+1)); n->leaf = true; n->marked = false; n->scxPtr = DUMMY; n->searchKey = l->keys[0]; // NOTE: WE MIGHT BE DELETING l->keys[0], IN WHICH CASE newL IS EMPTY. HOWEVER, newL CAN STILL BE LOCATED BY SEARCHING FOR l->keys[0], SO WE USE THAT AS THE searchKey FOR newL. 
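// NOTE (added, illustrative example): if l holds keys {5, 8} and key 5 is
// erased, the new leaf n holds only {8} yet keeps searchKey = 5; as the NOTE
// above says, n can still be located by searching for the deleted key.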
n->size = l->size-1; n->weight = true; // construct info record to pass to SCX info->numberOfNodes = 2; info->numberOfNodesAllocated = 1; info->numberOfNodesToFreeze = 1; info->field = &p->ptrs[ixToL]; info->newNode = n; info->insertedNodes[0] = n; info->insertedNodes[1] = NULL; info->deletedNodes[0] = l; info->deletedNodes[1] = NULL; void* oldValue = l->ptrs[keyIndex]; // since the node is a leaf, ptrs is not modified by any call to rqProvider->linearize_update_at_..., so we do not need to use read_addr to access it if (scx(tid, info)) { TRACE COUTATOMICTID("delete pair ("<recordmgr->enterQuiescentState(tid); return pair(oldValue, true); } TRACE COUTATOMICTID("delete pair ("<recordmgr->enterQuiescentState(tid); this->recordmgr->deallocate(tid, n); } } } /** * * * IMPLEMENTATION OF REBALANCING * * */ template bool abtree_ns::abtree::fixWeightViolation(const int tid, Node* viol) { if (viol->weight) return false; // assert: viol is internal (because leaves always have weight = 1) // assert: viol is not entry or root (because both always have weight = 1) // do an optimistic check to see if viol was already removed from the tree if (llx(tid, viol, NULL) == FINALIZED) { // recall that nodes are finalized precisely when // they are removed from the tree // we hand off responsibility for any violations at viol to the // process that removed it. return false; } wrapper_info _info; wrapper_info* info = &_info; // try to locate viol, and fix any weight violation at viol while (true) { const K k = viol->searchKey; Node* gp = NULL; Node* p = entry; Node* l = rqProvider->read_addr(tid, &p->ptrs[0]); int ixToP = -1; int ixToL = 0; while (!l->isLeaf() && l != viol) { ixToP = ixToL; ixToL = l->getChildIndex(k, cmp); gp = p; p = l; l = rqProvider->read_addr(tid, &l->ptrs[ixToL]); } if (l != viol) { // l was replaced by another update. // we hand over responsibility for viol to that update. return false; } // we cannot apply this update if p has a weight violation // so, we check if this is the case, and, if so, try to fix it if (!p->weight) { fixWeightViolation(tid, p); continue; } // perform LLXs if (!llx(tid, gp, NULL, 0, info->scxPtrs, info->nodes) || rqProvider->read_addr(tid, &gp->ptrs[ixToP]) != p) continue; // retry the search if (!llx(tid, p, NULL, 1, info->scxPtrs, info->nodes) || rqProvider->read_addr(tid, &p->ptrs[ixToL]) != l) continue; // retry the search if (!llx(tid, l, NULL, 2, info->scxPtrs, info->nodes)) continue; // retry the search const int c = p->getABDegree() + l->getABDegree(); const int size = c-1; if (size <= b) { /** * Absorb */ // create new node(s) // the new arrays are small enough to fit in a single node, // so we replace p by a new internal node. 
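// Illustrative picture (added commentary): Absorb splices l's children into
// p's child array in place of the pointer to l, yielding a single new node n
// that is installed at gp->ptrs[ixToP]:
//
//        gp                      gp
//        |                       |
//        p          ==>          n      (n->size = p->size + l->size - 1)
//      / | \                  / /|\ \
//     A  l  B                A c1 c2 B
//       / \
//      c1  c2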
Node* n = allocateNode(tid); arraycopy_ptrs(p->ptrs, 0, n->ptrs, 0, ixToL); // p and l are both internal, so we use arraycopy_ptrs arraycopy_ptrs(l->ptrs, 0, n->ptrs, ixToL, l->getABDegree()); arraycopy_ptrs(p->ptrs, ixToL+1, n->ptrs, ixToL+l->getABDegree(), p->getABDegree()-(ixToL+1)); arraycopy(p->keys, 0, n->keys, 0, ixToL); arraycopy(l->keys, 0, n->keys, ixToL, l->getKeyCount()); arraycopy(p->keys, ixToL, n->keys, ixToL+l->getKeyCount(), p->getKeyCount()-ixToL); n->leaf = false; assert(!l->isLeaf()); n->marked = false; n->scxPtr = DUMMY; n->searchKey = n->keys[0]; n->size = size; n->weight = true; // construct info record to pass to SCX info->numberOfNodes = 3; info->numberOfNodesAllocated = 1; info->numberOfNodesToFreeze = 3; info->field = &gp->ptrs[ixToP]; info->newNode = n; // info->insertedNodes[0] = info->deletedNodes[0] = NULL; info->insertedNodes[0] = n; info->insertedNodes[1] = NULL; info->deletedNodes[0] = p; info->deletedNodes[1] = l; info->deletedNodes[2] = NULL; if (scx(tid, info)) { TRACE COUTATOMICTID("absorb: SCX succeeded"< eliminated // slack at pi(u) -> eliminated or slack at n // weight at u -> eliminated // no degree at u // slack at u -> slack at n /** * Compress may be needed at the new internal node we created * (since we move grandchildren from two parents together). */ fixDegreeViolation(tid, n); return true; } TRACE COUTATOMICTID("absorb: SCX FAILED"<recordmgr->deallocate(tid, n); } else { /** * Split */ // merge keys of p and l into one big array (and similarly for children) // (we essentially replace the pointer to l with the contents of l) K keys[2*DEGREE]; Node* ptrs[2*DEGREE]; arraycopy_ptrs(p->ptrs, 0, ptrs, 0, ixToL); // p and l are both internal, so we use arraycopy_ptrs arraycopy_ptrs(l->ptrs, 0, ptrs, ixToL, l->getABDegree()); arraycopy_ptrs(p->ptrs, ixToL+1, ptrs, ixToL+l->getABDegree(), p->getABDegree()-(ixToL+1)); arraycopy(p->keys, 0, keys, 0, ixToL); arraycopy(l->keys, 0, keys, ixToL, l->getKeyCount()); arraycopy(p->keys, ixToL, keys, ixToL+l->getKeyCount(), p->getKeyCount()-ixToL); // the new arrays are too big to fit in a single node, // so we replace p by a new internal node and two new children. // // we take the big merged array and split it into two arrays, // which are used to create two new children u and v. // we then create a new internal node (whose weight will be zero // if it is not the root), with u and v as its children. 
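// NOTE (added commentary): this Split divides the merged p+l contents --
// size = c-1 child pointers in total -- between two new internal nodes:
// 'left' takes size/2 pointers (and size/2 - 1 keys), 'right' takes the
// rest, and a new two-child parent n is installed at gp->ptrs[ixToP].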
// create new node(s) const int size1 = size / 2; Node* left = allocateNode(tid); arraycopy(keys, 0, left->keys, 0, size1-1); arraycopy_ptrs(ptrs, 0, left->ptrs, 0, size1); left->leaf = false; assert(!l->isLeaf()); left->marked = false; left->scxPtr = DUMMY; left->searchKey = keys[0]; left->size = size1; left->weight = true; const int size2 = size - size1; Node* right = allocateNode(tid); arraycopy(keys, size1, right->keys, 0, size2-1); arraycopy_ptrs(ptrs, size1, right->ptrs, 0, size2); right->leaf = false; right->marked = false; right->scxPtr = DUMMY; right->searchKey = keys[size1]; right->size = size2; right->weight = true; Node* n = allocateNode(tid); n->keys[0] = keys[size1-1]; rqProvider->write_addr(tid, &n->ptrs[0], left); rqProvider->write_addr(tid, &n->ptrs[1], right); n->leaf = false; n->marked = false; n->scxPtr = DUMMY; n->searchKey = keys[size1-1]; // note: should be the same as n->keys[0] n->size = 2; n->weight = (gp == entry); // note: weight of new internal node n will be zero, // unless it is the root; this is because we test // gp == entry, above; in doing this, we are actually // performing Root-Zero at the same time as this Overflow // if n will become the root (of the B-slack tree) // construct info record to pass to SCX info->numberOfNodes = 3; info->numberOfNodesAllocated = 3; info->numberOfNodesToFreeze = 3; info->field = &gp->ptrs[ixToP]; info->newNode = n; // info->insertedNodes[0] = info->deletedNodes[0] = NULL; info->insertedNodes[0] = n; info->insertedNodes[1] = left; info->insertedNodes[2] = right; info->insertedNodes[3] = NULL; info->deletedNodes[0] = p; info->deletedNodes[1] = l; info->deletedNodes[2] = NULL; if (scx(tid, info)) { TRACE COUTATOMICTID("split: SCX succeeded"<recordmgr->deallocate(tid, n); this->recordmgr->deallocate(tid, left); this->recordmgr->deallocate(tid, right); } } } template bool abtree_ns::abtree::fixDegreeViolation(const int tid, Node* viol) { if (viol->getABDegree() >= a || viol == entry || viol == rqProvider->read_addr(tid, &entry->ptrs[0])) { return false; // no degree violation at viol } // do an optimistic check to see if viol was already removed from the tree if (llx(tid, viol, NULL) == FINALIZED) { // recall that nodes are finalized precisely when // they are removed from the tree. // we hand off responsibility for any violations at viol to the // process that removed it. return false; } wrapper_info _info; wrapper_info* info = &_info; // we search for viol and try to fix any violation we find there // this entails performing AbsorbSibling or Distribute. while (true) { /** * search for viol */ const K k = viol->searchKey; Node* gp = NULL; Node* p = entry; Node* l = rqProvider->read_addr(tid, &p->ptrs[0]); int ixToP = -1; int ixToL = 0; while (!l->isLeaf() && l != viol) { ixToP = ixToL; ixToL = l->getChildIndex(k, cmp); gp = p; p = l; l = rqProvider->read_addr(tid, &l->ptrs[ixToL]); } if (l != viol) { // l was replaced by another update. // we hand over responsibility for viol to that update. return false; } // assert: gp != NULL (because if AbsorbSibling or Distribute can be applied, then p is not the root) // perform LLXs if (!llx(tid, gp, NULL, 0, info->scxPtrs, info->nodes) || rqProvider->read_addr(tid, &gp->ptrs[ixToP]) != p) continue; // retry the search if (!llx(tid, p, NULL, 1, info->scxPtrs, info->nodes) || rqProvider->read_addr(tid, &p->ptrs[ixToL]) != l) continue; // retry the search int ixToS = (ixToL > 0 ? 
ixToL-1 : 1); Node* s = rqProvider->read_addr(tid, &p->ptrs[ixToS]); // we can only apply AbsorbSibling or Distribute if there are no // weight violations at p, l or s. // so, we first check for any weight violations, // and fix any that we see. bool foundWeightViolation = false; if (!p->weight) { foundWeightViolation = true; fixWeightViolation(tid, p); } if (!l->weight) { foundWeightViolation = true; fixWeightViolation(tid, l); } if (!s->weight) { foundWeightViolation = true; fixWeightViolation(tid, s); } // if we see any weight violations, then either we fixed one, // removing one of these nodes from the tree, // or one of the nodes has been removed from the tree by another // rebalancing step, so we retry the search for viol if (foundWeightViolation) continue; // assert: there are no weight violations at p, l or s // assert: l and s are either both leaves or both internal nodes // (because there are no weight violations at these nodes) // also note that p->size >= a >= 2 Node* left; Node* right; int leftindex; int rightindex; if (ixToL < ixToS) { if (!llx(tid, l, NULL, 2, info->scxPtrs, info->nodes)) continue; // retry the search if (!llx(tid, s, NULL, 3, info->scxPtrs, info->nodes)) continue; // retry the search left = l; right = s; leftindex = ixToL; rightindex = ixToS; } else { if (!llx(tid, s, NULL, 2, info->scxPtrs, info->nodes)) continue; // retry the search if (!llx(tid, l, NULL, 3, info->scxPtrs, info->nodes)) continue; // retry the search left = s; right = l; leftindex = ixToS; rightindex = ixToL; } int sz = left->getABDegree() + right->getABDegree(); assert(left->weight && right->weight); if (sz < 2*a) { /** * AbsorbSibling */ // create new node(s)) Node* newl = allocateNode(tid); int k1=0, k2=0; for (int i=0;igetKeyCount();++i) { newl->keys[k1++] = left->keys[i]; } for (int i=0;igetABDegree();++i) { if (left->isLeaf()) { newl->ptrs[k2++] = left->ptrs[i]; } else { //assert(left->getKeyCount() != left->getABDegree()); rqProvider->write_addr(tid, &newl->ptrs[k2++], rqProvider->read_addr(tid, &left->ptrs[i])); } } if (!left->isLeaf()) newl->keys[k1++] = p->keys[leftindex]; for (int i=0;igetKeyCount();++i) { newl->keys[k1++] = right->keys[i]; } for (int i=0;igetABDegree();++i) { if (right->isLeaf()) { newl->ptrs[k2++] = right->ptrs[i]; } else { rqProvider->write_addr(tid, &newl->ptrs[k2++], rqProvider->read_addr(tid, &right->ptrs[i])); } } newl->leaf = left->isLeaf(); newl->marked = false; newl->scxPtr = DUMMY; newl->searchKey = l->searchKey; newl->size = l->getABDegree() + s->getABDegree(); newl->weight = true; assert(left->weight && right->weight && p->weight); // now, we atomically replace p and its children with the new nodes. // if appropriate, we perform RootAbsorb at the same time. if (gp == entry && p->getABDegree() == 2) { // construct info record to pass to SCX info->numberOfNodes = 4; // gp + p + l + s info->numberOfNodesAllocated = 1; // newl info->numberOfNodesToFreeze = 4; // gp + p + l + s info->field = &gp->ptrs[ixToP]; info->newNode = newl; info->insertedNodes[0] = newl; info->insertedNodes[1] = NULL; info->deletedNodes[0] = p; info->deletedNodes[1] = l; info->deletedNodes[2] = s; info->deletedNodes[3] = NULL; if (scx(tid, info)) { TRACE COUTATOMICTID("absorbsibling AND rootabsorb: SCX succeeded"<recordmgr->deallocate(tid, newl); } else { assert(gp != entry || p->getABDegree() > 2); // create n from p by: // 1. skipping the key for leftindex and child pointer for ixToS // 2. 
replacing l with newl Node* n = allocateNode(tid); for (int i=0;ikeys[i] = p->keys[i]; } for (int i=0;iwrite_addr(tid, &n->ptrs[i], rqProvider->read_addr(tid, &p->ptrs[i])); // n and p are internal, so their ptrs arrays might have entries that are being modified by rqProvider->linearize_update_at_..., so we use read_addr and write_addr } for (int i=leftindex+1;igetKeyCount();++i) { n->keys[i-1] = p->keys[i]; } for (int i=ixToL+1;igetABDegree();++i) { rqProvider->write_addr(tid, &n->ptrs[i-1], rqProvider->read_addr(tid, &p->ptrs[i])); // n and p are internal, so their ptrs arrays might have entries that are being modified by rqProvider->linearize_update_at_..., so we use read_addr and write_addr } // replace l with newl rqProvider->write_addr(tid, &n->ptrs[ixToL - (ixToL > ixToS)], newl); n->leaf = false; n->marked = false; n->scxPtr = DUMMY; n->searchKey = p->searchKey; n->size = p->getABDegree()-1; n->weight = true; // construct info record to pass to SCX info->numberOfNodes = 4; // gp + p + l + s info->numberOfNodesAllocated = 2; // n + newl info->numberOfNodesToFreeze = 4; // gp + p + l + s info->field = &gp->ptrs[ixToP]; info->newNode = n; info->insertedNodes[0] = n; info->insertedNodes[1] = newl; info->insertedNodes[2] = NULL; info->deletedNodes[0] = p; info->deletedNodes[1] = l; info->deletedNodes[2] = s; info->deletedNodes[3] = NULL; if (scx(tid, info)) { TRACE COUTATOMICTID("absorbsibling: SCX succeeded"<recordmgr->deallocate(tid, newl); this->recordmgr->deallocate(tid, n); } } else { /** * Distribute */ int leftsz = sz/2; int rightsz = sz-leftsz; // create new node(s)) Node* n = allocateNode(tid); Node* newleft = allocateNode(tid); Node* newright = allocateNode(tid); // combine the contents of l and s (and one key from p if l and s are internal) K keys[2*DEGREE]; Node* ptrs[2*DEGREE]; int k1=0, k2=0; for (int i=0;igetKeyCount();++i) { keys[k1++] = left->keys[i]; } for (int i=0;igetABDegree();++i) { if (left->isLeaf()) { ptrs[k2++] = left->ptrs[i]; } else { ptrs[k2++] = rqProvider->read_addr(tid, &left->ptrs[i]); } } if (!left->isLeaf()) keys[k1++] = p->keys[leftindex]; for (int i=0;igetKeyCount();++i) { keys[k1++] = right->keys[i]; } for (int i=0;igetABDegree();++i) { if (right->isLeaf()) { ptrs[k2++] = right->ptrs[i]; } else { ptrs[k2++] = rqProvider->read_addr(tid, &right->ptrs[i]); } } // distribute contents between newleft and newright k1=0; k2=0; for (int i=0;iisLeaf();++i) { newleft->keys[i] = keys[k1++]; } for (int i=0;iisLeaf()) { newleft->ptrs[i] = ptrs[k2++]; } else { rqProvider->write_addr(tid, &newleft->ptrs[i], ptrs[k2++]); } } newleft->leaf = left->isLeaf(); newleft->marked = false; newleft->scxPtr = DUMMY; newleft->searchKey = newleft->keys[0]; newleft->size = leftsz; newleft->weight = true; // reserve one key for the parent (to go between newleft and newright) K keyp = keys[k1]; if (!left->isLeaf()) ++k1; for (int i=0;iisLeaf();++i) { newright->keys[i] = keys[k1++]; } for (int i=0;iisLeaf()) { newright->ptrs[i] = ptrs[k2++]; } else { rqProvider->write_addr(tid, &newright->ptrs[i], ptrs[k2++]); } } newright->leaf = right->isLeaf(); newright->marked = false; newright->scxPtr = DUMMY; newright->searchKey = newright->keys[0]; newright->size = rightsz; newright->weight = true; // create n from p by replacing left with newleft and right with newright, // and replacing one key (between these two pointers) for (int i=0;igetKeyCount();++i) { n->keys[i] = p->keys[i]; } for (int i=0;igetABDegree();++i) { rqProvider->write_addr(tid, &n->ptrs[i], rqProvider->read_addr(tid, 
&p->ptrs[i])); // n and p are internal, so their ptrs arrays might have entries that are being modified by rqProvider->linearize_update_at_..., so we use read_addr and write_addr } n->keys[leftindex] = keyp; rqProvider->write_addr(tid, &n->ptrs[leftindex], newleft); rqProvider->write_addr(tid, &n->ptrs[rightindex], newright); n->leaf = false; n->marked = false; n->scxPtr = DUMMY; n->searchKey = p->searchKey; n->size = p->size; n->weight = true; // construct info record to pass to SCX info->numberOfNodes = 4; // gp + p + l + s info->numberOfNodesAllocated = 3; // n + newleft + newright info->numberOfNodesToFreeze = 4; // gp + p + l + s info->field = &gp->ptrs[ixToP]; info->newNode = n; info->insertedNodes[0] = n; info->insertedNodes[1] = newleft; info->insertedNodes[2] = newright; info->insertedNodes[3] = NULL; info->deletedNodes[0] = p; info->deletedNodes[1] = l; info->deletedNodes[2] = s; info->deletedNodes[3] = NULL; if (scx(tid, info)) { TRACE COUTATOMICTID("distribute: SCX succeeded"<recordmgr->deallocate(tid, n); this->recordmgr->deallocate(tid, newleft); this->recordmgr->deallocate(tid, newright); } } } /** * * IMPLEMENTATION OF LLX AND SCX * * */ template bool abtree_ns::abtree::llx(const int tid, Node* r, Node ** snapshot, const int i, SCXRecord ** ops, Node ** nodes) { SCXRecord* result = llx(tid, r, snapshot); if (result == FAILED || result == FINALIZED) return false; ops[i] = result; nodes[i] = r; return true; } template abtree_ns::SCXRecord* abtree_ns::abtree::llx(const int tid, Node* r, Node ** snapshot) { const bool marked = r->marked; SOFTWARE_BARRIER; tagptr_t tagptr = (tagptr_t) r->scxPtr; // read mutable state field of descriptor bool succ; TRACE COUTATOMICTID("tagged ptr seq="<c.nodes[i]))<<" is a leaf\n"); assert(i > 0); // nodes[0] cannot be a leaf... continue; // do not freeze leaves } bool successfulCAS = __sync_bool_compare_and_swap(&snap->c.nodes[i]->scxPtr, snap->c.scxPtrsSeen[i], tagptr); SCXRecord *exp = snap->c.nodes[i]->scxPtr; // TRACE if (successfulCAS) COUTATOMICTID((helpingOther?" ":"")<<"help froze nodes["<c.nodes[i])<<" with tagptr="<c.nodes[i]->scxPtr)<c.mutables, tagptr, MUTABLES1_MASK_ALLFROZEN, MUTABLES1_OFFSET_ALLFROZEN); if (!succ) return SCXRecord::STATE_ABORTED; if (allFrozen) { TRACE COUTATOMICTID((helpingOther?" ":"")<<"help return state "<::STATE_COMMITTED<<" after failed freezing cas on nodes["<::STATE_COMMITTED; } else { const int newState = SCXRecord::STATE_ABORTED; TRACE COUTATOMICTID((helpingOther?" ":"")<<"help return state "<c.mutables, snap->c.mutables, newState, MUTABLES1_MASK_STATE, MUTABLES1_OFFSET_STATE); return newState; } } MUTABLES1_WRITE_BIT(ptr->c.mutables, snap->c.mutables, MUTABLES1_MASK_ALLFROZEN); SOFTWARE_BARRIER; for (int i=1; ic.numberOfNodesToFreeze; ++i) { if (snap->c.nodes[i]->isLeaf()) continue; // do not mark leaves snap->c.nodes[i]->marked = true; // finalize all but first node } // CAS in the new sub-tree (update CAS) rqProvider->linearize_update_at_cas(tid, snap->c.field, snap->c.nodes[1], snap->c.newNode, snap->c.insertedNodes, snap->c.deletedNodes); // __sync_bool_compare_and_swap(snap->c.field, snap->c.nodes[1], snap->c.newNode); TRACE COUTATOMICTID((helpingOther?" ":"")<<"help CAS'ed to newNode@0x"<<((uintptr_t)snap->c.newNode)<c.mutables, snap->c.mutables, SCXRecord::STATE_COMMITTED, MUTABLES1_MASK_STATE, MUTABLES1_OFFSET_STATE); TRACE COUTATOMICTID((helpingOther?" 
":"")<<"help return COMMITTED after performing update cas"<::STATE_COMMITTED; // success } #endif /* ABTREE_IMPL_H */ ================================================ FILE: datastructures/trevor_brown_abtree/minimal_example.cpp ================================================ /** * Author: Trevor Brown (me [at] tbrown [dot] pro). * Copyright 2018. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, version 2 * of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include #include #include #include "brown_ext_abtree_lf_adapter.h" int main(int argc, char** argv) { const int NODE_DEGREE = 16; const int ANY_KEY = 0; const int NUM_THREADS = 1; auto tree = new ds_adapter(NUM_THREADS, ANY_KEY); const int threadID = 0; tree->initThread(threadID); void * oldVal = tree->insertIfAbsent(threadID, 7, (void *) 1020); assert(oldVal == tree->getNoValue()); bool result = tree->contains(threadID, 7); assert(result); result = tree->contains(threadID, 8); assert(!result); void * val = tree->find(threadID, 7); assert(val == (void *) 1020); val = tree->erase(threadID, 7); assert(val == (void *) 1020); result = tree->contains(threadID, 7); assert(!result); tree->deinitThread(threadID); delete tree; std::cout<<"Passed quick tests."< #include #include #include #include "common/ThreadRegistry.hpp" #include "ds/natarajan_ext_bst_lf/natarajan_ext_bst_lf_adapter.h" /* * This is a wrapper to Trevor Brown's implementation of Naratajan's lock=free Tree so we can use it in our benchmarks */ template class TrevorBrownNatarajanTree { const int NUM_THREADS = 128; //ds_adapter>* tree; ds_adapter* tree; public: TrevorBrownNatarajanTree(int numThreads) { const int minValue = 0; const int maxValue = std::numeric_limits::max(); const int noValue = -1; //tree = new ds_adapter>(minValue, maxValue, noValue, NUM_THREADS); tree = new ds_adapter(minValue, maxValue, noValue, NUM_THREADS); } ~TrevorBrownNatarajanTree() { // TODO: deinit threads? delete tree; } // Inserts a key only if it's not already present bool add(K key, const int tid=0) { int threadID = tl_tcico.tid; if (threadID == ThreadCheckInCheckOut::NOT_ASSIGNED) { threadID = ThreadRegistry::getTID(); tree->initThread(threadID); } return tree->insertIfAbsent(threadID, key, 1) != tree->getNoValue(); } // Returns true only if the key was present bool remove(K key, const int tid=0) { int threadID = tl_tcico.tid; if (threadID == ThreadCheckInCheckOut::NOT_ASSIGNED) { threadID = ThreadRegistry::getTID(); tree->initThread(threadID); } return tree->erase(threadID, key) != tree->getNoValue(); } bool contains(K key, const int tid=0) { int threadID = tl_tcico.tid; if (threadID == ThreadCheckInCheckOut::NOT_ASSIGNED) { threadID = ThreadRegistry::getTID(); tree->initThread(threadID); } return tree->contains(threadID, key); } // This is not fully transactionally but it's ok because we use it only on initialization. // We could make it fully transactionally, but we would have to increase the size of allocation/store logs. 
    // This is not fully transactional, but that's ok because we use it only on initialization.
    // We could make it fully transactional, but we would have to increase the size of the allocation/store logs.
    void addAll(K** keys, int size, const int tid=0) {
        for (int i = 0; i < size; i++) add(*keys[i], tid);
    }

    static std::string className() { return "TrevorBrown-Natarajan-Tree"; }
};

#endif // _TREVOR_BROWN_NATARAJAN_TREE_HP_H_

================================================
FILE: datastructures/trevor_brown_natarajan/ds/natarajan_ext_bst_lf/natarajan_ext_bst_lf_adapter.h
================================================

/*
 * Implementation of the lock-free tree of Natarajan and Mittal.
 *
 * Heavily edited by Trevor Brown (me [at] tbrown [dot] pro).
 * (Late 2017, early 2018.)
 *
 * Notable changes:
 * - Converted original implementation to a class.
 * - Fixed a bug: atomic_ops types don't contain "volatile," so the original
 *   implementation behaved erroneously under high contention.
 * - Fixed the original implementation's erroneous memory reclamation,
 *   which would leak many nodes.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2
 * of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * Created on August 31, 2017, 6:22 PM
 */

#ifndef NATARAJAN_EXT_BST_LF_ADAPTER_H
#define NATARAJAN_EXT_BST_LF_ADAPTER_H

#include <iostream>
#include "errors.h"
#include "natarajan_ext_bst_lf_stage2_impl.h"

#define RECORD_MANAGER_T record_manager<Reclaim, Alloc, Pool, node_t<K,V>>
#define DATA_STRUCTURE_T natarajan_ext_bst_lf<K, V, RECORD_MANAGER_T>

template <typename K, typename V, class Reclaim = reclaimer_debra<K>, class Alloc = allocator_new<K>, class Pool = pool_none<K>>
class ds_adapter {
private:
    const V NO_VALUE;
    DATA_STRUCTURE_T * const tree;

public:
    ds_adapter(const K& MIN_KEY, const K& MAX_KEY, const V& _NO_VALUE, const int numThreads)
        : NO_VALUE(_NO_VALUE)
        , tree(new DATA_STRUCTURE_T(MAX_KEY, NO_VALUE, numThreads))
    {}

    ~ds_adapter() { delete tree; }

    V getNoValue() { return NO_VALUE; }

    void initThread(const int tid) { tree->initThread(tid); }
    void deinitThread(const int tid) { tree->deinitThread(tid); }

    bool contains(const int tid, const K& key) { return tree->find(tid, key) != getNoValue(); }
    V insert(const int tid, const K& key, const V& val) { error("insert-replace not implemented for this data structure"); }
    V insertIfAbsent(const int tid, const K& key, const V& val) { return tree->insertIfAbsent(tid, key, val); }
    V erase(const int tid, const K& key) { return tree->erase(tid, key); }
    V find(const int tid, const K& key) { return tree->find(tid, key); }
    int rangeQuery(const int tid, const K& lo, const K& hi, K * const resultKeys, V * const resultValues) {
        error("rangeQuery not implemented for this data structure");
    }

    /**
     * Sequential operation to get the number of keys in the set
     */
    int getSize() { return tree->getSize(); }

    void printSummary() { tree->printSummary(); }
    long long getKeyChecksum() { return tree->getKeyChecksum(); }
    bool validateStructure() { return tree->validateStructure(); }
    void printObjectSizes() {
        std::cout<<"sizes: node="<<(sizeof(node_t<K,V>))<<std::endl;
    }
};

#endif /* NATARAJAN_EXT_BST_LF_ADAPTER_H */

================================================
FILE: datastructures/trevor_brown_natarajan/ds/natarajan_ext_bst_lf/natarajan_ext_bst_lf_stage1.h
================================================

/*
 * A Lock Free Binary Search Tree
 * File:
 *   wfrbt.h
 * Author(s):
 *   Aravind Natarajan
 * Description:
 * A Lock Free Binary Search Tree
 *
 * Copyright (c) 2013-2014.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2
 * of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * Please cite our PPoPP 2014 paper - Fast Concurrent Lock-Free Binary Search Trees
 * by Aravind Natarajan and Neeraj Mittal if you use our code in your experiments
 *
 * Features:
 * 1. Insert operations directly install their window without injecting the
 *    operation into the tree. They help any conflicting operation at the
 *    injection point, before executing their window txn.
 * 2. Delete operations are the same as that of the original algorithm.
 */

/*
 * File:   wfrbt.h
 * Author: Maya Arbel-Raviv
 *
 * Created on June 8, 2017, 10:45 AM
 */

/*
 * Heavily edited by Trevor Brown (me [at] tbrown [dot] pro).
 * (Late 2017, early 2018.)
 *
 * - Converted to a class and added proper memory reclamation.
 * - Fixed a bug: atomic_ops types don't contain "volatile," so the original
 *   implementation behaved erroneously under high contention.
 * - Fixed the original implementation's erroneous memory reclamation,
 *   which would leak many nodes.
 */

#ifndef NATARAJAN_EXT_BST_LF_H
#define NATARAJAN_EXT_BST_LF_H

#include "errors.h"
#include "record_manager.h"
#include "atomic_ops.h"

#if (INDEX_STRUCT == IDX_NATARAJAN_EXT_BST_LF)
#elif (INDEX_STRUCT == IDX_NATARAJAN_EXT_BST_LF_BASELINE)
#error cannot support baseline with int keys and no value.
#else
#error
#endif

// Most of these macros are not used in this algorithm
#define MARK_BIT 1
#define FLAG_BIT 0

#define atomic_cas_full(addr, old_val, new_val) __sync_bool_compare_and_swap(addr, old_val, new_val);

#define create_child_word(addr, mark, flag) (((uintptr_t) addr << 2) + (mark << 1) + (flag))
#define is_marked(x) ( ((x >> 1) & 1) == 1 ? true:false)
#define is_flagged(x) ( (x & 1 ) == 1 ? true:false)
#define get_addr(x) (x >> 2)
#define add_mark_bit(x) (x + 4UL)
#define is_free(x) (((x) & 3) == 0? true:false)

enum { INSERT, DELETE };
enum { UNMARK, MARK };
enum { UNFLAG, FLAG };

typedef uintptr_t Word;

template <typename skey_t, typename sval_t>
struct node_t {
    union {
        struct {
            skey_t key;
            sval_t value;
            volatile AO_double_t child;
        };
#ifdef MIN_NODE_SIZE
        char bytes[MIN_NODE_SIZE];
#endif
    };
};

template <typename skey_t, typename sval_t>
struct seekRecord_t {
    skey_t leafKey;
    sval_t leafValue;
    struct node_t<skey_t, sval_t>* leaf;
    struct node_t<skey_t, sval_t>* parent;
    AO_t pL;
    bool isLeftL; // is L the left child of P?
    struct node_t<skey_t, sval_t>* lum;
    AO_t lumC;
    bool isLeftUM; // is last unmarked node's child on access path the left child of the last unmarked node?
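    // Worked example (editor's note, not in the original file) of the child-word
    // encoding used by pL and lumC: create_child_word(addr, mark, flag) stores the
    // node address shifted left by 2, with the mark bit at position 1 and the flag
    // bit at position 0. E.g., for addr = 0x7f00 with MARK and UNFLAG:
    //   word = (0x7f00 << 2) + (1 << 1) + 0 = 0x1fc02
    //   get_addr(word) = 0x1fc02 >> 2 = 0x7f00, is_marked(word) = true,
    //   is_flagged(word) = false, and is_free(word) = false because the low
    //   two bits are not both zero.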
}; template struct thread_data_t { int id; struct node_t* rootOfTree; seekRecord_t* sr; // seek record seekRecord_t * ssr; // secondary seek record }; //static __thread thread_data_t * data = NULL; template > class natarajan_ext_bst_lf { private: RecMgr * const recmgr; Compare cmp; node_t * root; seekRecord_t* insseek(thread_data_t* data, skey_t key, int op); seekRecord_t* delseek(thread_data_t* data, skey_t key, int op); seekRecord_t* secondary_seek(thread_data_t* data, skey_t key, seekRecord_t* sr); sval_t delete_node(thread_data_t* data, skey_t key); sval_t insertIfAbsent(thread_data_t* data, skey_t key, sval_t value); sval_t search(thread_data_t* data, skey_t key); int help_conflicting_operation (thread_data_t* data, seekRecord_t* R); int inject(thread_data_t* data, seekRecord_t* R, int op); int perform_one_delete_window_operation(thread_data_t* data, seekRecord_t* R, skey_t key); int perform_one_insert_window_operation(thread_data_t* data, seekRecord_t* R, skey_t newKey, sval_t value); void retireDeletedNodes(thread_data_t* data, node_t * node, node_t * targetNode, bool pointerFlagged = false); int init[MAX_TID_POW2] = {0,}; public: const skey_t MAX_KEY; const sval_t NO_VALUE; const int NUM_PROCESSES; natarajan_ext_bst_lf(const skey_t& _MAX_KEY, const sval_t& _NO_VALUE, const int numProcesses) : MAX_KEY(_MAX_KEY) , NO_VALUE(_NO_VALUE) , NUM_PROCESSES(numProcesses) , recmgr(new RecMgr(numProcesses, SIGQUIT)) { const int tid = 0; initThread(tid); cmp = Compare(); recmgr->enterQuiescentState(tid); // block crash recovery signal for this thread, and enter an initial quiescent state. root = recmgr->template allocate>(tid); node_t * newLC = recmgr->template allocate>(tid); node_t * newRC = recmgr->template allocate>(tid); memset(newLC, 0, sizeof (struct node_t)); memset(newRC, 0, sizeof (struct node_t)); root->key = _MAX_KEY; newLC->key = _MAX_KEY - 1; newRC->key = _MAX_KEY; root->value = NO_VALUE; newLC->value = NO_VALUE; newRC->value = NO_VALUE; root->child.AO_val1 = create_child_word(newLC, UNMARK, UNFLAG); root->child.AO_val2 = create_child_word(newRC, UNMARK, UNFLAG); } void freeSubtree(node_t * curr) { const int tid = 0; if (curr == NULL) return; node_t * left = get_left(curr); node_t * right = get_right(curr); recmgr->deallocate(tid, curr); freeSubtree(left); freeSubtree(right); } ~natarajan_ext_bst_lf() { freeSubtree(root); delete recmgr; } void initThread(const int tid) { if (init[tid]) return; else init[tid] = !init[tid]; recmgr->initThread(tid); } void deinitThread(const int tid) { if (!init[tid]) return; else init[tid] = !init[tid]; recmgr->deinitThread(tid); } sval_t insertIfAbsent(const int tid, skey_t key, sval_t item) { assert(cmp(key, MAX_KEY-1)); thread_data_t data; seekRecord_t sr; seekRecord_t ssr; data.id = tid; data.sr = &sr; data.ssr = &ssr; data.rootOfTree = root; return insertIfAbsent(&data,key,item); } sval_t erase(const int tid, skey_t key) { assert(cmp(key, MAX_KEY-1)); thread_data_t data; seekRecord_t sr; seekRecord_t ssr; data.id = tid; data.sr = &sr; data.ssr = &ssr; data.rootOfTree = root; return delete_node(&data,key); } sval_t find(const int tid, skey_t key) { thread_data_t data; seekRecord_t sr; seekRecord_t ssr; data.id = tid; data.sr = &sr; data.ssr = &ssr; data.rootOfTree = root; return search(&data,key); } node_t * get_root() { return root; } node_t * get_left(node_t * curr) { return (node_t *)get_addr(curr->child.AO_val1); } node_t * get_right(node_t * curr) { return (node_t *)get_addr(curr->child.AO_val2); } long long getKeyChecksum(node_t * curr) { 
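        // Editor's note (not in the original file): this is an external BST, so real
        // key/value pairs live only in the leaves and internal nodes are routing nodes.
        // The checksum below therefore counts leaf keys only, and the public
        // getKeyChecksum()/getSize() wrappers start at get_left(get_left(root)) to skip
        // the sentinel root and its left sentinel child created in the constructor.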
if (curr == NULL) return 0; node_t * left = get_left(curr); node_t * right = get_right(curr); if (!left && !right) return (long long) curr->key; // leaf return getKeyChecksum(left) + getKeyChecksum(right); } long long getKeyChecksum() { return getKeyChecksum(get_left(get_left(root))); } long long getSize(node_t * curr) { if (curr == NULL) return 0; node_t * left = get_left(curr); node_t * right = get_right(curr); if (!left && !right) return 1; // leaf return getSize(left) + getSize(right); } bool validateStructure() { return true; } long long getSize() { return getSize(get_left(get_left(root))); } long long getSizeInNodes(node_t * const curr) { if (curr == NULL) return 0; return 1 + getSizeInNodes(get_left(curr)) + getSizeInNodes(get_right(curr)); } long long getSizeInNodes() { return getSizeInNodes(root); } void printSummary() { stringstream ss; ss<printStatus(); } }; #endif /* NATARAJAN_EXT_BST_LF_H */ ================================================ FILE: datastructures/trevor_brown_natarajan/ds/natarajan_ext_bst_lf/natarajan_ext_bst_lf_stage2_impl.h ================================================ /*A Lock Free Binary Search Tree * File: * wfrbt.cpp * Author(s): * Aravind Natarajan * Description: * A Lock Free Binary Search Tree * * Copyright (c) 2013-2014. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation, version 2 * of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. Please cite our PPoPP 2014 paper - Fast Concurrent Lock-Free Binary Search Trees by Aravind Natarajan and Neeraj Mittal if you use our code in your experiments Features: 1. Insert operations directly install their window without injecting the operation into the tree. They help any conflicting operation at the injection point, before executing their window txn. 2. Delete operations are the same as that of the original algorithm. */ /* * File: wfrbt_impl.h * Author: Maya Arbel-Raviv * * Created on June 8, 2017, 10:45 AM */ /* * Heavily edited by Trevor Brown (me [at] tbrown [dot] pro). * (Late 2017, early 2018.) * * - Converted to a class and added proper memory reclamation. * - Fixed a bug: atomic_ops types don't contain "volatile," so the original * implementation behaved erroneously under high contention. * - Fixed the original implementation's erroneous memory reclamation, * which would leak many nodes. 
*/ #ifndef NATARAJAN_EXT_BST_LF_IMPL_H #define NATARAJAN_EXT_BST_LF_IMPL_H #include "natarajan_ext_bst_lf_stage1.h" static inline bool SetBit(volatile size_t *array, int bit) { bool flag; __asm__ __volatile__("lock bts %2,%1; setb %0" : "=q" (flag) : "m" (*array), "r" (bit)); return flag; } static bool mark_Node(volatile AO_t * word) { return (SetBit(word, MARK_BIT)); } static volatile AO_t stop = 0; static volatile AO_t stop2 = 0; //long total_insert = 0; /* STRUCTURES */ enum { Front, Back }; //long blackCount = -1; //long leafNodes = 0; template seekRecord_t* natarajan_ext_bst_lf::insseek(thread_data_t* data, skey_t key, int op) { node_t * gpar = NULL; // last node (ancestor of parent on access path) whose child pointer field is unmarked node_t * par = data->rootOfTree; node_t * leaf; node_t * leafchild; AO_t parentPointerWord = (size_t) NULL; // contents in gpar AO_t leafPointerWord = par->child.AO_val1; // contents in par. Tree has two imaginary keys \inf_{1} and \inf_{2} which are larger than all other keys. AO_t leafchildPointerWord; // contents in leaf bool isparLC = false; // is par the left child of gpar bool isleafLC = true; // is leaf the left child of par bool isleafchildLC; // is leafchild the left child of leaf leaf = (node_t *)get_addr(leafPointerWord); if (cmp(key, leaf->key)) { leafchildPointerWord = leaf->child.AO_val1; isleafchildLC = true; } else { leafchildPointerWord = leaf->child.AO_val2; isleafchildLC = false; } leafchild = (node_t *)get_addr(leafchildPointerWord); while (leafchild != NULL) { if (!is_marked(leafPointerWord)) { gpar = par; parentPointerWord = leafPointerWord; isparLC = isleafLC; } par = leaf; leafPointerWord = leafchildPointerWord; isleafLC = isleafchildLC; leaf = leafchild; if (cmp(key, leaf->key)) { leafchildPointerWord = leaf->child.AO_val1; isleafchildLC = true; } else { leafchildPointerWord = leaf->child.AO_val2; isleafchildLC = false; } leafchild = (node_t *)get_addr(leafchildPointerWord); } // if (key == leaf->key) { // // key matches that being inserted // return NULL; // } seekRecord_t* R = data->sr; R->leafKey = leaf->key; R->leafValue = leaf->value; R->parent = par; R->pL = leafPointerWord; R->isLeftL = isleafLC; R->lum = gpar; R->lumC = parentPointerWord; R->isLeftUM = isparLC; return R; } template seekRecord_t* natarajan_ext_bst_lf::delseek(thread_data_t* data, skey_t key, int op) { node_t * gpar = NULL; // last node (ancestor of parent on access path) whose child pointer field is unmarked node_t * par = data->rootOfTree; node_t * leaf; node_t * leafchild; AO_t parentPointerWord = (AO_t) NULL; // contents in gpar AO_t leafPointerWord = par->child.AO_val1; // contents in par. Tree has two imaginary keys \inf_{1} and \inf_{2} which are larger than all other keys. 
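    // Editor's note (not in the original file): insseek, delseek and secondary_seek all
    // maintain the same invariant on the way down: (par, leaf, leafchild) is the current
    // search window, while (lum, lumC) remember the last edge on the access path whose
    // child word was unmarked. Deletions are later completed by a single CAS on exactly
    // that unmarked edge (see help_conflicting_operation), which splices out the marked
    // parent together with the flagged leaf in one step.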
AO_t leafchildPointerWord; // contents in leaf bool isparLC = false; // is par the left child of gpar bool isleafLC = true; // is leaf the left child of par bool isleafchildLC; // is leafchild the left child of leaf leaf = (node_t *)get_addr(leafPointerWord); if (cmp(key, leaf->key)) { leafchildPointerWord = leaf->child.AO_val1; isleafchildLC = true; } else { leafchildPointerWord = leaf->child.AO_val2; isleafchildLC = false; } leafchild = (node_t *)get_addr(leafchildPointerWord); while (leafchild != NULL) { if (!is_marked(leafPointerWord)) { gpar = par; parentPointerWord = leafPointerWord; isparLC = isleafLC; } par = leaf; leafPointerWord = leafchildPointerWord; isleafLC = isleafchildLC; leaf = leafchild; if (cmp(key, leaf->key)) { leafchildPointerWord = leaf->child.AO_val1; isleafchildLC = true; } else { leafchildPointerWord = leaf->child.AO_val2; isleafchildLC = false; } leafchild = (node_t *)get_addr(leafchildPointerWord); } // op = DELETE if (key != leaf->key) { // key is not found in the tree. return NULL; } seekRecord_t* R = data->sr; R->leafKey = leaf->key; R->leafValue = leaf->value; R->parent = par; R->leaf = leaf; R->pL = leafPointerWord; R->isLeftL = isleafLC; R->lum = gpar; R->lumC = parentPointerWord; R->isLeftUM = isparLC; return R; } template seekRecord_t* natarajan_ext_bst_lf::secondary_seek(thread_data_t* data, skey_t key, seekRecord_t* sr) { //std::cout << "sseek" << std::endl; node_t * flaggedLeaf = (node_t *)get_addr(sr->pL); node_t * gpar = NULL; // last node (ancestor of parent on access path) whose child pointer field is unmarked node_t * par = data->rootOfTree; node_t * leaf; node_t * leafchild; AO_t parentPointerWord = (AO_t) NULL; // contents in gpar AO_t leafPointerWord = par->child.AO_val1; // contents in par. Tree has two imaginary keys \inf_{1} and \inf_{2} which are larger than all other keys. AO_t leafchildPointerWord; // contents in leaf bool isparLC = false; // is par the left child of gpar bool isleafLC = true; // is leaf the left child of par bool isleafchildLC; // is leafchild the left child of leaf leaf = (node_t *)get_addr(leafPointerWord); if (cmp(key, leaf->key)) { leafchildPointerWord = leaf->child.AO_val1; isleafchildLC = true; } else { leafchildPointerWord = leaf->child.AO_val2; isleafchildLC = false; } leafchild = (node_t *)get_addr(leafchildPointerWord); while (leafchild != NULL) { if (!is_marked(leafPointerWord)) { gpar = par; parentPointerWord = leafPointerWord; isparLC = isleafLC; } par = leaf; leafPointerWord = leafchildPointerWord; isleafLC = isleafchildLC; leaf = leafchild; if (cmp(key, leaf->key)) { leafchildPointerWord = leaf->child.AO_val1; isleafchildLC = true; } else { leafchildPointerWord = leaf->child.AO_val2; isleafchildLC = false; } leafchild = (node_t *)get_addr(leafchildPointerWord); } if (!is_flagged(leafPointerWord) || (leaf != flaggedLeaf)) { // operation has been completed by another process. return NULL; } seekRecord_t* R = data->ssr; R->leafKey = leaf->key; R->parent = par; R->pL = leafPointerWord; R->isLeftL = isleafLC; R->lum = gpar; R->lumC = parentPointerWord; R->isLeftUM = isparLC; return R; } template sval_t natarajan_ext_bst_lf::search(thread_data_t* data, skey_t key) { recmgr->leaveQuiescentState(data->id); node_t * cur = (node_t *)get_addr(data->rootOfTree->child.AO_val1); skey_t lastKey = 0; node_t * lastNode = NULL; while (cur != NULL) { lastKey = cur->key; lastNode = cur; cur = (cmp(key, lastKey) ? 
(node_t *)get_addr(cur->child.AO_val1) : (node_t *)get_addr(cur->child.AO_val2)); } if (key == lastKey) { recmgr->enterQuiescentState(data->id); return lastNode->value; } recmgr->enterQuiescentState(data->id); return NO_VALUE; } //------------------------------------------------------------------------------------------------------------------------------------------------------- //------------------------------------------------------------------------------------------------------------------------------------------------------- template void natarajan_ext_bst_lf::retireDeletedNodes(thread_data_t* data, node_t * node, node_t * targetNode, bool pointerFlagged) { // traverse from node, retiring everything we deleted // (that is: every leaf pointed to by a flagged pointer, // and every internal node with a flagged pointer.) if (node == NULL) return; if (node == targetNode) return; // we reached the end of the nodes we deleted if ((node_t *) node->child.AO_val1 == NULL) { // node is a leaf if (pointerFlagged) { recmgr->retire(data->id, node); } return; } // node is internal if (is_flagged(node->child.AO_val1) || is_flagged(node->child.AO_val2)) { recmgr->retire(data->id, node); if (!is_free(node->child.AO_val1)) retireDeletedNodes(data, (node_t *) get_addr(node->child.AO_val1), targetNode, is_flagged(node->child.AO_val1)); if (!is_free(node->child.AO_val2)) retireDeletedNodes(data, (node_t *) get_addr(node->child.AO_val2), targetNode, is_flagged(node->child.AO_val2)); } } template int natarajan_ext_bst_lf::help_conflicting_operation(thread_data_t* data, seekRecord_t* R) { int result; node_t * target = NULL; if (is_flagged(R->pL)) { // leaf node is flagged for deletion by another process. //1. mark sibling of leaf node for deletion and then read its contents. AO_t pS; if (R->isLeftL) { // L is the left child of P mark_Node(&R->parent->child.AO_val2); pS = R->parent->child.AO_val2; } else { mark_Node(&R->parent->child.AO_val1); pS = R->parent->child.AO_val1; } // 2. Execute cas on the last unmarked node to remove the // if pS is flagged, propagate it. AO_t newWord; if (is_flagged(pS)) { newWord = create_child_word((node_t *)get_addr(pS), UNMARK, FLAG); } else { newWord = create_child_word((node_t *)get_addr(pS), UNMARK, UNFLAG); } target = (node_t *) get_addr(pS); if (R->isLeftUM) { result = atomic_cas_full(&R->lum->child.AO_val1, R->lumC, newWord); } else { result = atomic_cas_full(&R->lum->child.AO_val2, R->lumC, newWord); } } else { // leaf node is marked for deletion by another process. // Note that leaf is not flagged, as it will be taken care of in the above case. AO_t newWord; if (is_flagged(R->pL)) { newWord = create_child_word((node_t *)get_addr(R->pL), UNMARK, FLAG); } else { newWord = create_child_word((node_t *)get_addr(R->pL), UNMARK, UNFLAG); } target = (node_t *) get_addr(R->pL); if (R->isLeftUM) { result = atomic_cas_full(&R->lum->child.AO_val1, R->lumC, newWord); } else { result = atomic_cas_full(&R->lum->child.AO_val2, R->lumC, newWord); } } if (result) { retireDeletedNodes(data, (node_t *) get_addr(R->lumC), target); } return result; } //------------------------------------------------------------------------------------------------------------------------------------------------------- //------------------------------------------------------------------------------------------------------------------------------------------------------- template int natarajan_ext_bst_lf::inject(thread_data_t* data, seekRecord_t* R, int op) { // pL is free //1. 
Flag L AO_t newWord = create_child_word((node_t *)get_addr(R->pL), UNMARK, FLAG); int result; if (R->isLeftL) { result = atomic_cas_full(&R->parent->child.AO_val1, R->pL, newWord); } else { result = atomic_cas_full(&R->parent->child.AO_val2, R->pL, newWord); } return result; } template sval_t natarajan_ext_bst_lf::insertIfAbsent(thread_data_t* data, skey_t key, sval_t value) { int injectResult; // int fasttry = 0; while (true) { recmgr->leaveQuiescentState(data->id); seekRecord_t* R = insseek(data, key, INSERT); // fasttry++; if (R->leafKey == key) { // if (fasttry == 1) { return R->leafValue; // } else { // return NO_VALUE; // } } if (!is_free(R->pL)) { help_conflicting_operation(data, R); recmgr->enterQuiescentState(data->id); continue; } // key not present in the tree. Insert injectResult = perform_one_insert_window_operation(data, R, key, value); if (injectResult == 1) { // Operation injected and executed recmgr->enterQuiescentState(data->id); return NO_VALUE; } recmgr->enterQuiescentState(data->id); } // execute insert window operation. } template sval_t natarajan_ext_bst_lf::delete_node(thread_data_t* data, skey_t key) { int injectResult; sval_t retval = NO_VALUE; while (true) { recmgr->leaveQuiescentState(data->id); seekRecord_t* R = delseek(data, key, DELETE); if (R == NULL) { recmgr->enterQuiescentState(data->id); return retval; } // key is present in the tree. Inject operation into the tree if (!is_free(R->pL)) { help_conflicting_operation(data, R); recmgr->enterQuiescentState(data->id); continue; } injectResult = inject(data, R, DELETE); if (injectResult == 1) { retval = R->leafValue; // recmgr->retire(data->id, R->leaf); // if we won consensus and injected the operation, we retire the replaced leaf. (the replaced parent is retired by the guy who marks the sibling pointer in the parent.) // Operation injected //data->numActualDelete++; int res = perform_one_delete_window_operation(data, R, key); if (res == 1) { // operation successfully executed. recmgr->enterQuiescentState(data->id); return retval; } else { // window transaction could not be executed. // perform secondary seek. while (true) { R = secondary_seek(data, key, R); if (R == NULL) { // flagged leaf not found. Operation has been executed by some other process. recmgr->enterQuiescentState(data->id); return retval; } res = perform_one_delete_window_operation(data, R, key); if (res == 1) { recmgr->enterQuiescentState(data->id); return retval; } } } } recmgr->enterQuiescentState(data->id); // otherwise, operation was not injected. Restart. } } template int natarajan_ext_bst_lf::perform_one_insert_window_operation(thread_data_t* data, seekRecord_t* R, skey_t newKey, sval_t value) { node_t * newInt; node_t * newLeaf; // if(data->recycledNodes.empty()){ // node_t * allocedNodeArr = (node_t *)malloc(2 * sizeof (struct node_t)); // new pointerNode_t[2]; // newInt = &allocedNodeArr[0]; // newLeaf = &allocedNodeArr[1]; newInt = recmgr->template allocate>(data->id); if (newInt == NULL) { error("out of memory"); } #ifdef __HANDLE_STATS GSTATS_APPEND(data->id, node_allocated_addresses, (long long) newInt); #endif newLeaf = recmgr->template allocate>(data->id); if (newLeaf == NULL) { error("out of memory"); } #ifdef __HANDLE_STATS GSTATS_APPEND(data->id, node_allocated_addresses, (long long) newLeaf); #endif /* } else{ // reuse memory of previously allocated nodes. 
newInt = data->recycledNodes.back(); data->recycledNodes.pop_back(); newLeaf = data->recycledNodes.back(); data->recycledNodes.pop_back(); } */ newLeaf->child.AO_val1 = (size_t) NULL; newLeaf->child.AO_val2 = (size_t) NULL; newLeaf->key = newKey; newLeaf->value = value; node_t * existLeaf = (node_t *)get_addr(R->pL); skey_t existKey = R->leafKey; if (cmp(newKey, existKey)) { // key is to be inserted on lchild newInt->key = existKey; newInt->child.AO_val1 = create_child_word(newLeaf, 0, 0); newInt->child.AO_val2 = create_child_word(existLeaf, 0, 0); } else { // key is to be inserted on rchild newInt->key = newKey; newInt->child.AO_val2 = create_child_word(newLeaf, 0, 0); newInt->child.AO_val1 = create_child_word(existLeaf, 0, 0); } // cas to replace window AO_t newCasField; newCasField = create_child_word(newInt, UNMARK, UNFLAG); int result; if (R->isLeftL) { result = atomic_cas_full(&R->parent->child.AO_val1, R->pL, newCasField); } else { result = atomic_cas_full(&R->parent->child.AO_val2, R->pL, newCasField); } if (result == 1) { // successfully inserted. //data->numInsert++; return 1; } else { // reuse data and pointer nodes recmgr->deallocate(data->id, newInt); recmgr->deallocate(data->id, newLeaf); //data->recycledNodes.push_back(newInt); //data->recycledNodes.push_back(newLeaf); return 0; } } /*************************************************************************************************/ template int natarajan_ext_bst_lf::perform_one_delete_window_operation(thread_data_t* data, seekRecord_t* R, skey_t key) { // mark sibling. AO_t pS; bool markResult = 0; if (R->isLeftL) { // L is the left child of P markResult = mark_Node(&R->parent->child.AO_val2); pS = R->parent->child.AO_val2; } else { markResult = mark_Node(&R->parent->child.AO_val1); pS = R->parent->child.AO_val1; } //cout<<"key="<leafKey<<" markResult="<parent // recmgr->retire(data->id, R->parent); // } AO_t newWord; if (is_flagged(pS)) { newWord = create_child_word((node_t *)get_addr(pS), UNMARK, FLAG); } else { newWord = create_child_word((node_t *)get_addr(pS), UNMARK, UNFLAG); } int result; if (R->isLeftUM) { result = atomic_cas_full(&R->lum->child.AO_val1, R->lumC, newWord); } else { result = atomic_cas_full(&R->lum->child.AO_val2, R->lumC, newWord); } if (result) { retireDeletedNodes(data, (node_t *) get_addr(R->lumC), (node_t *) get_addr(pS)); } return result; } #endif /* NATARAJAN_EXT_BST_LF_IMPL_H */ ================================================ FILE: graphs/BenchmarkLatencyCounter.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. 
See LICENSE.txt */

#ifndef _BENCHMARK_LATENCY_COUNTER_H_
#define _BENCHMARK_LATENCY_COUNTER_H_

#include <atomic>
#include <chrono>
#include <thread>
#include <string>
#include <vector>
#include <algorithm>
#include <iostream>

using namespace std;
using namespace chrono;

/**
 * This is a micro-benchmark for measuring latency on an array of counters
 */
class BenchmarkLatencyCounter {

private:
    // Latency constants
    static const long long kLatencyMeasures = 1000000LL;  // We measure 1M iterations
    static const long long kLatencyWarmups  =  100000LL;  // Plus this many warmup iterations
    static const long long NSEC_IN_SEC = 1000000000LL;
    static const uint64_t NUM_COUNTERS = 64;

    int numThreads;

public:
    struct Result {
        uint64_t delay50000;
        uint64_t delay90000;
        uint64_t delay99000;
        uint64_t delay99900;
        uint64_t delay99990;
        uint64_t delay99999;
    };

    BenchmarkLatencyCounter(int numThreads) {
        this->numThreads = numThreads;
    }

    /*
     * Execute latency benchmarks
     * We only do one run for this benchmark
     */
    template<typename TM, template<typename> class TMTYPE>
    Result latencyBenchmark(std::string& className) {
        atomic<bool> start = { false };
        TMTYPE<uint64_t>* counters;
        TM::template updateTx([&] () { // It's ok to pass by reference because we're single-threaded
            counters = (TMTYPE<uint64_t>*)TM::tmMalloc(sizeof(TMTYPE<uint64_t>)*NUM_COUNTERS);
            for (int i = 0; i < NUM_COUNTERS; i++) counters[i] = 0;
        });
        auto latency_lambda = [this,&start,&counters](nanoseconds* delays, const int tid) {
            long long delayIndex = 0;
            while (!start.load()) this_thread::yield();
            // Warmup + Measurements
            for (int iter=0; iter < (kLatencyWarmups+kLatencyMeasures)/numThreads; iter++) {
                // Alternate transactions between left-right and right-left
                auto startBeats = steady_clock::now();
                TM::updateTx([=] () {
                    for (int i = 0; i < NUM_COUNTERS; i++) counters[i] = counters[i]+1;
                });
                auto stopBeats = steady_clock::now();
                if (iter >= kLatencyWarmups/numThreads) delays[delayIndex++] = (stopBeats-startBeats);
                TM::updateTx([=] () {
                    for (int i = NUM_COUNTERS-1; i > 0; i--) counters[i] = counters[i]+1;
                });
            }
        };
        nanoseconds* delays[numThreads];
        for (int it = 0; it < numThreads; it++) {
            delays[it] = new nanoseconds[kLatencyMeasures/numThreads];
            for (int imeas=0; imeas < kLatencyMeasures/numThreads; imeas++) delays[it][imeas] = 0ns;
        }
        cout << "##### " << TM::className() << " ##### \n";
        className = TM::className();
        thread latencyThreads[numThreads];
        for (int tid = 0; tid < numThreads; tid++) latencyThreads[tid] = thread(latency_lambda, delays[tid], tid);
        start.store(true);
        this_thread::sleep_for(50ms);
        for (int tid = 0; tid < numThreads; tid++) latencyThreads[tid].join();
        // Aggregate all the measured delays and compute the percentiles
        cout << "Aggregating delays for " << kLatencyMeasures/1000000 << " million measurements...\n";
        vector<nanoseconds> aggDelay(kLatencyMeasures);
        long long idx = 0;
        for (int it = 0; it < numThreads; it++) {
            for (int i = 0; i < kLatencyMeasures/numThreads; i++) {
                aggDelay[idx] = delays[it][i];
                idx++;
            }
        }
        // Sort the aggregated delays
        cout << "Sorting delays...\n";
        sort(aggDelay.begin(), aggDelay.end());
        // Show the 50% (median), 70%, 80%, 90%, 99%, 99.9%, 99.99%, 99.999% and maximum, in microseconds
        long per50000 = (long)(kLatencyMeasures*50000LL/100000LL);
        long per70000 = (long)(kLatencyMeasures*70000LL/100000LL);
        long per80000 = (long)(kLatencyMeasures*80000LL/100000LL);
        long per90000 = (long)(kLatencyMeasures*90000LL/100000LL);
        long per99000 = (long)(kLatencyMeasures*99000LL/100000LL);
        long per99900 = (long)(kLatencyMeasures*99900LL/100000LL);
        long per99990 = (long)(kLatencyMeasures*99990LL/100000LL);
        long per99999 = (long)(kLatencyMeasures*99999LL/100000LL);
        long imax = kLatencyMeasures-1;
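        // Worked example (editor's note, not in the original file): with the
        // kLatencyMeasures = 1,000,000 sorted samples above, the p-th percentile is read
        // at index kLatencyMeasures*p/100000, where p is expressed in thousandths of a
        // percent. E.g. per99900 = 1000000*99900/100000 = 999000, so aggDelay[999000]
        // holds the 99.9% latency and aggDelay[kLatencyMeasures-1] the observed maximum.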
cout << "Enqueue delay (us): 50%=" << aggDelay[per50000].count()/1000 << " 70%=" << aggDelay[per70000].count()/1000 << " 80%=" << aggDelay[per80000].count()/1000 << " 90%=" << aggDelay[per90000].count()/1000 << " 99%=" << aggDelay[per99000].count()/1000 << " 99.9%=" << aggDelay[per99900].count()/1000 << " 99.99%=" << aggDelay[per99990].count()/1000 << " 99.999%=" << aggDelay[per99999].count()/1000 << " max=" << aggDelay[imax].count()/1000 << "\n"; Result res = { (uint64_t)aggDelay[per50000].count()/1000, (uint64_t)aggDelay[per90000].count()/1000, (uint64_t)aggDelay[per99000].count()/1000, (uint64_t)aggDelay[per99900].count()/1000, (uint64_t)aggDelay[per99990].count()/1000, (uint64_t)aggDelay[per99999].count()/1000 }; /* // Show in csv format cout << "delay (us):\n"; cout << "50, " << aggDelay[per50000].count()/1000 << "\n"; cout << "90, " << aggDelay[per90000].count()/1000 << "\n"; cout << "99, " << aggDelay[per99000].count()/1000 << "\n"; cout << "99.9, " << aggDelay[per99900].count()/1000 << "\n"; cout << "99.99, " << aggDelay[per99990].count()/1000 << "\n"; cout << "99.999, " << aggDelay[per99999].count()/1000 << "\n"; */ TM::template updateTx([&] () { // It's ok to pass by reference because we're single-threaded TM::tmFree(counters); }); // Cleanup for (int it = 0; it < numThreads; it++) delete[] delays[it]; return res; } #ifdef NEVER public: static void allLatencyTests() { // Burst Latency benchmarks //vector threadList = { 30, 30, 30, 30, 30, 30, 30 }; // For the latency table in the paper //vector threadList = { 4 }; vector threadList = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 30, 32 }; for (int nThreads : threadList) { BenchmarkLatencyQ bench(nThreads, 0, 0s); // Only the numThreads is used in this test std::cout << "\n----- Burst Latency numThreads=" << bench.numThreads << " kLatencyMeasures=" << kLatencyMeasures/1000000LL << "M -----\n"; bench.latencyBurstBenchmark>(); } for (int nThreads : threadList) { BenchmarkLatencyQ bench(nThreads, 0, 0s); // Only the numThreads is used in this test std::cout << "\n----- Burst Latency numThreads=" << bench.numThreads << " kLatencyMeasures=" << kLatencyMeasures/1000000LL << "M -----\n"; bench.latencyBurstBenchmark>(); } for (int nThreads : threadList) { BenchmarkLatencyQ bench(nThreads, 0, 0s); // Only the numThreads is used in this test std::cout << "\n----- Burst Latency numThreads=" << bench.numThreads << " kLatencyMeasures=" << kLatencyMeasures/1000000LL << "M -----\n"; bench.latencyBurstBenchmark>(); } /* for (int nThreads : threadList) { BenchmarkLatencyQ bench(nThreads, 0, 0s); // Only the numThreads is used in this test std::cout << "\n----- Burst Latency numThreads=" << bench.numThreads << " kLatencyMeasures=" << kLatencyMeasures/1000000LL << "M -----\n"; bench.latencyBurstBenchmark>(); } for (int nThreads : threadList) { BenchmarkLatencyQ bench(nThreads, 0, 0s); // Only the numThreads is used in this test std::cout << "\n----- Burst Latency numThreads=" << bench.numThreads << " kLatencyMeasures=" << kLatencyMeasures/1000000LL << "M -----\n"; bench.latencyBurstBenchmark>(); } */ } #endif }; #endif ================================================ FILE: graphs/BenchmarkLatencyQueues.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. 
See LICENSE.txt */ #ifndef _BENCHMARK_LATENCY_Q_H_ #define _BENCHMARK_LATENCY_Q_H_ #include #include #include #include #include #include using namespace std; using namespace chrono; /** * This is a micro-benchmark for measuring latency */ class BenchmarkLatencyQueues { private: struct UserData { long long seq; int tid; UserData(long long lseq, int ltid) { this->seq = lseq; this->tid = ltid; } UserData() { this->seq = -2; this->tid = -2; } UserData(const UserData &other) : seq(other.seq), tid(other.tid) { } bool operator < (const UserData& other) const { return seq < other.seq; } }; struct Result { nanoseconds nsEnq = 0ns; nanoseconds nsDeq = 0ns; long long numEnq = 0; long long numDeq = 0; long long totOpsSec = 0; Result() { } Result(const Result &other) { nsEnq = other.nsEnq; nsDeq = other.nsDeq; numEnq = other.numEnq; numDeq = other.numDeq; totOpsSec = other.totOpsSec; } bool operator < (const Result& other) const { return totOpsSec < other.totOpsSec; } }; // Latency constants static const long long kLatencyMeasures = 200000000LL; // We measure 100M iterations divided among the different threads static const long long kLatencyWarmupIterations = 10; // At start of latency tests we do 10M warmup enqueues and dequeues in bursts of 100K static const long long kLatencyIterations = 200; // We do this many iterations of 100K enqueues and dequeues until we get kLatencyMeasures static const long long kLatencyBurst = kLatencyMeasures/kLatencyIterations; static const long long NSEC_IN_SEC = 1000000000LL; int numThreads; int numRuns; seconds testLengthSeconds; public: BenchmarkLatencyQueues(int numThreads, int numRuns, seconds testLength) { this->numThreads = numThreads; this->numRuns = numRuns; this->testLengthSeconds = testLength; } /* * Execute latency benchmarks * Make sure to enable high priority for the Windows process * * We can use this Mathematica function to compute the Inverse CDF of a Poisson and model the latency at 99.99% for lock-free algorithms: * https://reference.wolfram.com/language/ref/InverseCDF.html * * We only do one run for this benchmark */ template void latencyBurstBenchmark() { atomic startEnq = { false }; atomic startDeq = { false }; atomic barrier = { 0 }; Q* queue = new Q(numThreads); auto latency_lambda = [this,&startEnq,&startDeq,&barrier,&queue](nanoseconds* enqDelays, nanoseconds* deqDelays, const int tid) { UserData ud(0,0); long long enqDelayIndex = 0; long long deqDelayIndex = 0; // Warmup + Measurements for (int iter=0; iter < kLatencyIterations+kLatencyWarmupIterations; iter++) { // Start with enqueues while (!startEnq.load()) this_thread::yield(); for (long long i = 0; i < kLatencyBurst/numThreads; i++) { auto startBeats = steady_clock::now(); queue->enqueue(&ud, tid); auto stopBeats = steady_clock::now(); if (iter >= kLatencyWarmupIterations) enqDelays[enqDelayIndex++] = (stopBeats-startBeats); } if (barrier.fetch_add(1) == numThreads) cout << "ERROR: in barrier\n"; // dequeues while (!startDeq.load()) this_thread::yield(); for (long long i = 0; i < kLatencyBurst/numThreads; i++) { auto startBeats = steady_clock::now(); if (queue->dequeue(tid) == nullptr) cout << "ERROR: dequeued nullptr in i=" << i << "\n"; auto stopBeats = steady_clock::now(); if (iter >= kLatencyWarmupIterations) deqDelays[deqDelayIndex++] = (stopBeats-startBeats); } if (barrier.fetch_add(1) == numThreads) cout << "ERROR: in barrier\n"; } }; nanoseconds* enqDelays[numThreads]; // Half enqueues and half dequeues nanoseconds* deqDelays[numThreads]; for (int it = 0; it < numThreads; it++) { 
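            // Editor's note (not in the original file): the measurement loop further down
            // is a two-phase handshake. Workers spin on startEnq, run one enqueue burst of
            // kLatencyBurst/numThreads timed operations, then fetch_add(1) on 'barrier';
            // the main thread waits until barrier == numThreads, CASes it back to 0, and
            // repeats the same dance with startDeq for the dequeue burst. A burst therefore
            // never mixes enqueue and dequeue latencies across threads.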
enqDelays[it] = new nanoseconds[kLatencyMeasures/numThreads]; deqDelays[it] = new nanoseconds[kLatencyMeasures/numThreads]; for (int imeas=0; imeas < kLatencyMeasures/numThreads; imeas++) { enqDelays[it][imeas] = 0ns; deqDelays[it][imeas] = 0ns; } } cout << "##### " << queue->className() << " ##### \n"; thread latencyThreads[numThreads]; for (int tid = 0; tid < numThreads; tid++) latencyThreads[tid] = thread(latency_lambda, enqDelays[tid], deqDelays[tid], tid); this_thread::sleep_for(50ms); for (int iter=0; iter < kLatencyIterations+kLatencyWarmupIterations; iter++) { // enqueue round startEnq.store(true); while (barrier.load() != numThreads) this_thread::yield(); startEnq.store(false); long tmp = numThreads; if (!barrier.compare_exchange_strong(tmp, 0)) cout << "ERROR: CAS\n"; // dequeue round startDeq.store(true); while (barrier.load() != numThreads) this_thread::yield(); startDeq.store(false); tmp = numThreads; if (!barrier.compare_exchange_strong(tmp, 0)) cout << "ERROR: CAS\n"; } for (int tid = 0; tid < numThreads; tid++) latencyThreads[tid].join(); delete queue; // Aggregate all the delays for enqueues and dequeues and compute the maxs cout << "Aggregating delays for " << kLatencyMeasures/1000000 << " million measurements...\n"; vector aggEnqDelay(kLatencyMeasures); long long idx = 0; for (int it = 0; it < numThreads; it++) { for (int i = 0; i < kLatencyMeasures/numThreads; i++) { aggEnqDelay[idx] = enqDelays[it][i]; idx++; } } vector aggDeqDelay(kLatencyMeasures); idx = 0; for (int it = 0; it < numThreads; it++) { for (int i = 0; i < kLatencyMeasures/numThreads; i++) { aggDeqDelay[idx] = deqDelays[it][i]; idx++; } } // Sort the aggregated delays cout << "Sorting delays...\n"; sort(aggEnqDelay.begin(), aggEnqDelay.end()); sort(aggDeqDelay.begin(), aggDeqDelay.end()); // Show the 50% (median), 90%, 99%, 99.9%, 99.99%, 99.999% and maximum in microsecond/nanoseconds units long per50000 = (long)(kLatencyMeasures*50000LL/100000LL); long per70000 = (long)(kLatencyMeasures*70000LL/100000LL); long per80000 = (long)(kLatencyMeasures*80000LL/100000LL); long per90000 = (long)(kLatencyMeasures*90000LL/100000LL); long per99000 = (long)(kLatencyMeasures*99000LL/100000LL); long per99900 = (long)(kLatencyMeasures*99900LL/100000LL); long per99990 = (long)(kLatencyMeasures*99990LL/100000LL); long per99999 = (long)(kLatencyMeasures*99999LL/100000LL); long imax = kLatencyMeasures-1; cout << "Enqueue delay (us): 50%=" << aggEnqDelay[per50000].count()/1000 << " 70%=" << aggEnqDelay[per70000].count()/1000 << " 80%=" << aggEnqDelay[per80000].count()/1000 << " 90%=" << aggEnqDelay[per90000].count()/1000 << " 99%=" << aggEnqDelay[per99000].count()/1000 << " 99.9%=" << aggEnqDelay[per99900].count()/1000 << " 99.99%=" << aggEnqDelay[per99990].count()/1000 << " 99.999%=" << aggEnqDelay[per99999].count()/1000 << " max=" << aggEnqDelay[imax].count()/1000 << "\n"; cout << "Dequeue delay (us): 50%=" << aggDeqDelay[per50000].count()/1000 << " 70%=" << aggDeqDelay[per70000].count()/1000 << " 80%=" << aggDeqDelay[per80000].count()/1000 << " 90%=" << aggDeqDelay[per90000].count()/1000 << " 99%=" << aggDeqDelay[per99000].count()/1000 << " 99.9%=" << aggDeqDelay[per99900].count()/1000 << " 99.99%=" << aggDeqDelay[per99990].count()/1000 << " 99.999%=" << aggDeqDelay[per99999].count()/1000 << " max=" << aggDeqDelay[imax].count()/1000 << "\n"; // Show in csv format cout << "Enqueue delay (us):\n"; cout << "50, " << aggEnqDelay[per50000].count()/1000 << "\n"; cout << "90, " << aggEnqDelay[per90000].count()/1000 << "\n"; cout 
<< "99, " << aggEnqDelay[per99000].count()/1000 << "\n"; cout << "99.9, " << aggEnqDelay[per99900].count()/1000 << "\n"; cout << "99.99, " << aggEnqDelay[per99990].count()/1000 << "\n"; cout << "99.999, " << aggEnqDelay[per99999].count()/1000 << "\n"; cout << "Dequeue delay (us):\n"; cout << "50, " << aggDeqDelay[per50000].count()/1000 << "\n"; cout << "90, " << aggDeqDelay[per90000].count()/1000 << "\n"; cout << "99, " << aggDeqDelay[per99000].count()/1000 << "\n"; cout << "99.9, " << aggDeqDelay[per99900].count()/1000 << "\n"; cout << "99.99, " << aggDeqDelay[per99990].count()/1000 << "\n"; cout << "99.999, " << aggDeqDelay[per99999].count()/1000 << "\n"; // Cleanup for (int it = 0; it < numThreads; it++) { delete[] enqDelays[it]; delete[] deqDelays[it]; } } #ifdef NEVER public: static void allLatencyTests() { // Burst Latency benchmarks //vector threadList = { 30, 30, 30, 30, 30, 30, 30 }; // For the latency table in the paper //vector threadList = { 4 }; vector threadList = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 30, 32 }; for (int nThreads : threadList) { BenchmarkLatencyQ bench(nThreads, 0, 0s); // Only the numThreads is used in this test std::cout << "\n----- Burst Latency numThreads=" << bench.numThreads << " kLatencyMeasures=" << kLatencyMeasures/1000000LL << "M -----\n"; bench.latencyBurstBenchmark>(); } for (int nThreads : threadList) { BenchmarkLatencyQ bench(nThreads, 0, 0s); // Only the numThreads is used in this test std::cout << "\n----- Burst Latency numThreads=" << bench.numThreads << " kLatencyMeasures=" << kLatencyMeasures/1000000LL << "M -----\n"; bench.latencyBurstBenchmark>(); } for (int nThreads : threadList) { BenchmarkLatencyQ bench(nThreads, 0, 0s); // Only the numThreads is used in this test std::cout << "\n----- Burst Latency numThreads=" << bench.numThreads << " kLatencyMeasures=" << kLatencyMeasures/1000000LL << "M -----\n"; bench.latencyBurstBenchmark>(); } /* for (int nThreads : threadList) { BenchmarkLatencyQ bench(nThreads, 0, 0s); // Only the numThreads is used in this test std::cout << "\n----- Burst Latency numThreads=" << bench.numThreads << " kLatencyMeasures=" << kLatencyMeasures/1000000LL << "M -----\n"; bench.latencyBurstBenchmark>(); } for (int nThreads : threadList) { BenchmarkLatencyQ bench(nThreads, 0, 0s); // Only the numThreads is used in this test std::cout << "\n----- Burst Latency numThreads=" << bench.numThreads << " kLatencyMeasures=" << kLatencyMeasures/1000000LL << "M -----\n"; bench.latencyBurstBenchmark>(); } */ } #endif }; #endif ================================================ FILE: graphs/BenchmarkMaps.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. 
See LICENSE.txt */

#ifndef _BENCHMARK_MAPS_H_
#define _BENCHMARK_MAPS_H_

#include <atomic>
#include <chrono>
#include <thread>
#include <iostream>
#include <vector>
#include <algorithm>
#include <functional>

using namespace std;
using namespace chrono;

// Regular UserData
struct UserData {
    long long seq;
    int tid;
    UserData(long long lseq, int ltid=0) { this->seq = lseq; this->tid = ltid; }
    UserData() { this->seq = -2; this->tid = -2; }
    UserData(const UserData &other) : seq(other.seq), tid(other.tid) { }
    bool operator < (const UserData& other) const { return seq < other.seq; }
    bool operator == (const UserData& other) const { return seq == other.seq && tid == other.tid; }
    bool operator != (const UserData& other) const { return seq != other.seq || tid != other.tid; }
};

namespace std {
    template <>
    struct hash<UserData> {
        std::size_t operator()(const UserData& k) const {
            using std::size_t;
            using std::hash;
            return (hash<long long>()(k.seq)); // This hash has no collisions, which is unrealistic
        }
    };
}

/**
 * This is a micro-benchmark of maps, used in the CX paper
 */
class BenchmarkMaps {

private:
    struct Result {
        nanoseconds nsEnq = 0ns;
        nanoseconds nsDeq = 0ns;
        long long numEnq = 0;
        long long numDeq = 0;
        long long totOpsSec = 0;
        Result() { }
        Result(const Result &other) {
            nsEnq = other.nsEnq;
            nsDeq = other.nsDeq;
            numEnq = other.numEnq;
            numDeq = other.numDeq;
            totOpsSec = other.totOpsSec;
        }
        bool operator < (const Result& other) const { return totOpsSec < other.totOpsSec; }
    };

    static const long long NSEC_IN_SEC = 1000000000LL;
    int numThreads;

public:
    BenchmarkMaps(int numThreads) { this->numThreads = numThreads; }

    /**
     * When doing "updates" we execute a random removal and if the removal is successful we do a put() of the
     * same item immediately after. This keeps the size of the data structure equal to the original size (minus
     * MAX_THREADS items at most) which gives more deterministic results.
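     *
     * A minimal sketch of that update step (mirroring rw_lambda below; keyarray,
     * valarray and ix are the names used there):
     *
     *   if (set->remove(*keyarray[ix])) {             // random removal
     *       numOps++;
     *       set->put(*keyarray[ix], *valarray[ix]);   // re-insert the same key/value
     *   }
     *   numOps++;
     *
     * A successful remove-and-put thus counts as two operations and a failed remove as one.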
*/ template class S, typename K, typename V> long long benchmark(const int updateRatio, const seconds testLengthSeconds, const int numRuns, const int numElements, const bool dedicated=false) { long long ops[numThreads][numRuns]; long long lengthSec[numRuns]; atomic quit = { false }; atomic startFlag = { false }; S* set = nullptr; #ifdef TINY_STM stm_init_thread(); //const int tid = 0; //WRITE_TX_BEGIN //set = TM_ALLOC(); //WRITE_TX_END #endif // Create all the keys and values in the concurrent set K** keyarray = new K*[numElements]; for (int i = 0; i < numElements; i++) keyarray[i] = new K(i); V** valarray = new V*[numElements]; for (int i = 0; i < numElements; i++) valarray[i] = new V(i); // Can either be a Reader or a Writer auto rw_lambda = [&](const int updateRatio, long long *ops, const int tid) { uint64_t accum = 0; long long numOps = 0; #ifdef TINY_STM stm_init_thread(); #endif while (!startFlag.load()) ; // spin uint64_t seed = tid+1234567890123456781ULL; while (!quit.load()) { seed = randomLong(seed); int update = seed%1000; seed = randomLong(seed); auto ix = (unsigned int)(seed%numElements); if (update < updateRatio) { // I'm a Writer if (set->remove(*keyarray[ix])) { numOps++; set->put(*keyarray[ix], *valarray[ix]); } numOps++; } else { // I'm a Reader set->get(*keyarray[ix]); seed = randomLong(seed); ix = (unsigned int)(seed%numElements); set->get(*keyarray[ix]); numOps+=2; } } *ops = numOps; #ifdef TINY_STM stm_exit_thread(); #endif }; for (int irun = 0; irun < numRuns; irun++) { set = new S(); // Add all the items to the list set->addAll(keyarray, valarray, numElements); if (irun == 0) std::cout << "##### " << set->className() << " ##### \n"; thread rwThreads[numThreads]; if (dedicated) { rwThreads[0] = thread(rw_lambda, 1000, &ops[0][irun], 0); rwThreads[1] = thread(rw_lambda, 1000, &ops[1][irun], 1); for (int tid = 2; tid < numThreads; tid++) rwThreads[tid] = thread(rw_lambda, updateRatio, &ops[tid][irun], tid); } else { for (int tid = 0; tid < numThreads; tid++) rwThreads[tid] = thread(rw_lambda, updateRatio, &ops[tid][irun], tid); } this_thread::sleep_for(100ms); auto startBeats = steady_clock::now(); startFlag.store(true); // Sleep for testLengthSeconds seconds this_thread::sleep_for(testLengthSeconds); quit.store(true); auto stopBeats = steady_clock::now(); for (int tid = 0; tid < numThreads; tid++) rwThreads[tid].join(); lengthSec[irun] = (stopBeats-startBeats).count(); if (dedicated) { // We don't account for the write-only operations but we aggregate the values from the two threads and display them std::cout << "Mutative transactions per second = " << (ops[0][irun] + ops[1][irun])*1000000000LL/lengthSec[irun] << "\n"; ops[0][irun] = 0; ops[1][irun] = 0; } quit.store(false); startFlag.store(false); // Measure the time the destructor takes to complete and if it's more than 1 second, print it out auto startDel = steady_clock::now(); #ifdef TINY_STM WRITE_TX_BEGIN TM_FREE(set); WRITE_TX_END #endif delete set; auto stopDel = steady_clock::now(); if ((startDel-stopDel).count() > NSEC_IN_SEC) { std::cout << "Destructor took " << (startDel-stopDel).count()/NSEC_IN_SEC << " seconds\n"; } // Compute ops at the end of each run long long agg = 0; for (int tid = 0; tid < numThreads; tid++) { agg += ops[tid][irun]*1000000000LL/lengthSec[irun]; } } for (int i = 0; i < numElements; i++) delete keyarray[i]; delete[] keyarray; // Accounting vector agg(numRuns); for (int irun = 0; irun < numRuns; irun++) { for (int tid = 0; tid < numThreads; tid++) { agg[irun] += 
ops[tid][irun]*1000000000LL/lengthSec[irun]; } } // Compute the median. numRuns must be an odd number sort(agg.begin(),agg.end()); auto maxops = agg[numRuns-1]; auto minops = agg[0]; auto medianops = agg[numRuns/2]; auto delta = (long)(100.*(maxops-minops) / ((double)medianops)); // Printed value is the median of the number of ops per second that all threads were able to accomplish (on average) std::cout << "Ops/sec = " << medianops << " delta = " << delta << "% min = " << minops << " max = " << maxops << "\n"; #ifdef TINY_STM stm_exit_thread(); #endif return medianops; } /** * An imprecise but fast random number generator */ uint64_t randomLong(uint64_t x) { x ^= x >> 12; // a x ^= x << 25; // b x ^= x >> 27; // c return x * 2685821657736338717LL; } }; #endif ================================================ FILE: graphs/BenchmarkQueues.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _BENCHMARK_Q_H_ #define _BENCHMARK_Q_H_ #include #include #include #include #include #include #include using namespace std; using namespace chrono; struct UserData { long long seq; int tid; UserData(long long lseq, int ltid) { this->seq = lseq; this->tid = ltid; } UserData() { this->seq = -2; this->tid = -2; } UserData(const UserData &other) : seq(other.seq), tid(other.tid) { } bool operator < (const UserData& other) const { return seq < other.seq; } }; /** * This is a micro-benchmark to run the tests shown in CRTurnQueue paper * *
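 *
 * Both benchmarks below report the median of numRuns runs; e.g. enqDeq()
 * computes its result exactly as in its final lines:
 *
 *   median    = agg[numRuns/2].count()/numThreads;  // per-thread ns of the median run
 *   opsPerSec = numPairs*2*NSEC_IN_SEC/median;      // x2 because each pair is one enq + one deq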

 * Performance Benchmarks
 * TODO
 *
 * Latency Distribution
* * - We fire up 28 threads of type LatencyThread; * - Each thread does alternatively 1000 enqueue() and 1000 dequeue(). All dequeues are non-null; * - After start, each thread does 1M iterations as warmup. * - Measurements are done for 4M iterations, that are saved in a local array, 2M enqueue and 2M dequeue; * - * */ class BenchmarkQueues { private: struct Result { nanoseconds nsEnq = 0ns; nanoseconds nsDeq = 0ns; long long numEnq = 0; long long numDeq = 0; long long totOpsSec = 0; Result() { } Result(const Result &other) { nsEnq = other.nsEnq; nsDeq = other.nsDeq; numEnq = other.numEnq; numDeq = other.numDeq; totOpsSec = other.totOpsSec; } bool operator < (const Result& other) const { return totOpsSec < other.totOpsSec; } }; // Performance benchmark constants static const long long kNumPairsWarmup = 1000000LL; // Each threads does 1M iterations as warmup // Contants for Ping-Pong performance benchmark static const int kPingPongBatch = 1000; // Each thread starts by injecting 1k items in the queue static const long long NSEC_IN_SEC = 1000000000LL; int numThreads; public: BenchmarkQueues(int numThreads) { this->numThreads = numThreads; } /** * enqueue-dequeue pairs: in each iteration a thread executes an enqueue followed by a dequeue; * the benchmark executes 10^8 pairs partitioned evenly among all threads; */ template uint64_t enqDeq(std::string& className, const long numPairs, const int numRuns) { nanoseconds deltas[numThreads][numRuns]; atomic startFlag = { false }; Q* queue = nullptr; className = Q::className(); cout << "##### " << className << " ##### \n"; auto enqdeq_lambda = [this,&startFlag,&numPairs,&queue](nanoseconds *delta, const int tid) { UserData ud(0,0); while (!startFlag.load()) {} // Spin until the startFlag is set // Warmup phase for (long long iter = 0; iter < kNumPairsWarmup/numThreads; iter++) { queue->enqueue(&ud, tid); if (queue->dequeue(tid) == nullptr) cout << "Error at warmup dequeueing iter=" << iter << "\n"; } // Measurement phase auto startBeats = steady_clock::now(); for (long long iter = 0; iter < numPairs/numThreads; iter++) { queue->enqueue(&ud, tid); if (queue->dequeue(tid) == nullptr) cout << "Error at measurement dequeueing iter=" << iter << "\n"; } auto stopBeats = steady_clock::now(); *delta = stopBeats - startBeats; }; for (int irun = 0; irun < numRuns; irun++) { queue = new Q(numThreads); thread enqdeqThreads[numThreads]; for (int tid = 0; tid < numThreads; tid++) enqdeqThreads[tid] = thread(enqdeq_lambda, &deltas[tid][irun], tid); startFlag.store(true); // Sleep for 2 seconds just to let the threads see the startFlag this_thread::sleep_for(2s); for (int tid = 0; tid < numThreads; tid++) enqdeqThreads[tid].join(); startFlag.store(false); delete (Q*)queue; } // Sum up all the time deltas of all threads so we can find the median run vector agg(numRuns); for (int irun = 0; irun < numRuns; irun++) { agg[irun] = 0ns; for (int tid = 0; tid < numThreads; tid++) { agg[irun] += deltas[tid][irun]; } } // Compute the median. 
numRuns should be an odd number sort(agg.begin(),agg.end()); auto median = agg[numRuns/2].count()/numThreads; // Normalize back to per-thread time (mean of time for this run) cout << "Total Ops/sec = " << numPairs*2*NSEC_IN_SEC/median << "\n"; return (numPairs*2*NSEC_IN_SEC/median); } /** * Start with only enqueues 100K/numThreads, wait for them to finish, then do only dequeues but only 100K/numThreads */ template void burst(std::string& className, uint64_t& resultsEnq, uint64_t& resultsDeq, const long long burstSize, const int numIters, const int numRuns, const bool isSC=false) { Result results[numThreads][numRuns]; atomic startEnq = { false }; atomic startDeq = { false }; atomic barrier = { 0 }; Q* queue = nullptr; auto burst_lambda = [this,&startEnq,&startDeq,&burstSize,&barrier,&numIters,&isSC,&queue](Result *res, const int tid) { UserData ud(0,0); // Warmup only if it is not Single-Consumer if (!isSC) { const long long warmupIters = 100000LL; // Do 100K for each thread as a warmup for (long long iter = 0; iter < warmupIters; iter++) queue->enqueue(&ud, tid); for (long long iter = 0; iter < warmupIters; iter++) { if (queue->dequeue(tid) == nullptr) cout << "ERROR: warmup dequeued nullptr in iter=" << iter << "\n"; } } // Measurements for (int iter = 0; iter < numIters; iter++) { // Start with enqueues while (!startEnq.load()) {} // spin is better than yield here auto startBeats = steady_clock::now(); for (long long i = 0; i < burstSize/numThreads; i++) { queue->enqueue(&ud, tid); } auto stopBeats = steady_clock::now(); res->nsEnq += (stopBeats-startBeats); res->numEnq += burstSize/numThreads; if (barrier.fetch_add(1) == numThreads) cout << "ERROR: in barrier\n"; // dequeues while (!startDeq.load()) { } // spin is better than yield here if (isSC) { // Handle the single-consumer case if (tid == 0) { startBeats = steady_clock::now(); // We need to deal with rounding errors in the single-consumer case for (long long i = 0; i < ((long long)(burstSize/numThreads))*numThreads; i++) { if (queue->dequeue(tid) == nullptr) { cout << "ERROR: dequeued nullptr in iter=" << i << "\n"; assert(false); } } stopBeats = steady_clock::now(); if (queue->dequeue(tid) != nullptr) cout << "ERROR: dequeued non-null, there must be duplicate items!\n"; res->nsDeq += (stopBeats-startBeats); res->numDeq += burstSize/numThreads; } } else { startBeats = steady_clock::now(); for (long long i = 0; i < burstSize/numThreads; i++) { if (queue->dequeue(tid) == nullptr) { cout << "ERROR: dequeued nullptr in iter=" << i << "\n"; assert(false); } } stopBeats = steady_clock::now(); res->nsDeq += (stopBeats-startBeats); res->numDeq += burstSize/numThreads; } if (barrier.fetch_add(1) == numThreads) cout << "ERROR: in barrier\n"; } }; for (int irun = 0; irun < numRuns; irun++) { queue = new Q(numThreads); if (irun == 0) { className = queue->className(); cout << "##### " << queue->className() << " ##### \n"; } thread burstThreads[numThreads]; for (int tid = 0; tid < numThreads; tid++) burstThreads[tid] = thread(burst_lambda, &results[tid][irun], tid); this_thread::sleep_for(100ms); for (int iter=0; iter < numIters; iter++) { // enqueue round startEnq.store(true); while (barrier.load() != numThreads) this_thread::yield(); startEnq.store(false); long tmp = numThreads; if (!barrier.compare_exchange_strong(tmp, 0)) cout << "ERROR: CAS\n"; // dequeue round startDeq.store(true); while (barrier.load() != numThreads) this_thread::yield(); startDeq.store(false); tmp = numThreads; if (!barrier.compare_exchange_strong(tmp, 0)) cout << 
"ERROR: CAS\n"; } for (int tid = 0; tid < numThreads; tid++) burstThreads[tid].join(); delete queue; } // Accounting vector agg(numRuns); for (int irun = 0; irun < numRuns; irun++) { nanoseconds maxNsEnq = 0ns; nanoseconds maxNsDeq = 0ns; for (int tid = 0; tid < numThreads; tid++) { if (results[tid][irun].nsEnq > maxNsEnq) maxNsEnq = results[tid][irun].nsEnq; if (results[tid][irun].nsDeq > maxNsDeq) maxNsDeq = results[tid][irun].nsDeq; agg[irun].numEnq += results[tid][irun].numEnq; agg[irun].numDeq += results[tid][irun].numDeq; } agg[irun].nsEnq = maxNsEnq; agg[irun].nsDeq = maxNsDeq; agg[irun].totOpsSec = agg[irun].nsEnq.count()+agg[irun].nsDeq.count(); } // Compute the median. numRuns should be an odd number sort(agg.begin(),agg.end()); Result median = agg[numRuns/2]; const long long allThreadsEnqPerSec = median.numEnq*NSEC_IN_SEC/median.nsEnq.count(); const long long allThreadsDeqPerSec = median.numDeq*NSEC_IN_SEC/median.nsDeq.count(); // Printed value is the median of the number of ops per second that all threads were able to accomplish (on average) cout << "Enq/sec = " << allThreadsEnqPerSec << " Deq/sec = " << allThreadsDeqPerSec << "\n"; resultsEnq = allThreadsEnqPerSec; resultsDeq = allThreadsDeqPerSec; } }; #endif ================================================ FILE: graphs/BenchmarkSPS.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _BENCHMARK_SPS_H_ #define _BENCHMARK_SPS_H_ #include #include #include #include #include #include #include #include #include static const long arraySize = 1000; // 1k or 1M entries in the SPS array using namespace std; using namespace chrono; /** * This is a micro-benchmark */ class BenchmarkSPS { private: int numThreads; public: struct UserData { long long seq; int tid; UserData(long long lseq, int ltid) { this->seq = lseq; this->tid = ltid; } UserData() { this->seq = -2; this->tid = -2; } UserData(const UserData &other) : seq(other.seq), tid(other.tid) { } bool operator < (const UserData& other) const { return seq < other.seq; } }; BenchmarkSPS(int numThreads) { this->numThreads = numThreads; } /* * An array of integers that gets randomly permutated. 
*/ template class TMTYPE> uint64_t benchmarkSPSInteger(std::string& className, const seconds testLengthSeconds, const long numSwapsPerTx, const int numRuns) { long long ops[numThreads][numRuns]; long long lengthSec[numRuns]; atomic startFlag = { false }; atomic quit = { false }; className = TM::className(); cout << "##### " << TM::className() << " ##### \n"; // Create the array of integers and initialize it TMTYPE* parray; // It's ok to capture by reference, we're running single-threaded now TM::updateTx([&] () { parray = new TMTYPE[arraySize]; //parray = (TMTYPE*)TM::tmMalloc(sizeof(TMTYPE)*arraySize); } ); // Break up the initialization into transactions of 1k stores, so it fits in the log for (long j = 0; j < arraySize; j+=1000) { TM::updateTx([&] () { for (int i = 0; i < 1000 && i+j < arraySize; i++) parray[i+j] = i+j; } ); } auto func = [this,&startFlag,&quit,&numSwapsPerTx,&parray](long long *ops, const int tid) { uint64_t seed = tid+1234567890123456781ULL; // Spin until the startFlag is set while (!startFlag.load()) {} // Do transactions until the quit flag is set long long tcount = 0; while (!quit.load()) { TM::updateTx([&] () { for (int i = 0; i < numSwapsPerTx; i++) { seed = randomLong(seed); auto ia = seed%arraySize; uint64_t tmp = parray[ia]; seed = randomLong(seed); auto ib = seed%arraySize; parray[ia] = parray[ib]; parray[ib] = tmp; } } ); ++tcount; /* PE::read_transaction([this,&seed,&parray,&numWordsPerTransaction] () { PersistentArrayInt* read_array = PE::template get_object>(PIDX_INT_ARRAY); // Check that the array is consistent int sum = 0; for (int i = 0; i < arraySize; i++) { sum += read_array->counters[i]; } assert(sum == 0); } ); */ } *ops = tcount; }; for (int irun = 0; irun < numRuns; irun++) { if (irun == 0) className = TM::className(); thread enqdeqThreads[numThreads]; for (int tid = 0; tid < numThreads; tid++) enqdeqThreads[tid] = thread(func, &ops[tid][irun], tid); auto startBeats = steady_clock::now(); startFlag.store(true); // Sleep for 20 seconds this_thread::sleep_for(testLengthSeconds); quit.store(true); auto stopBeats = steady_clock::now(); for (int tid = 0; tid < numThreads; tid++) enqdeqThreads[tid].join(); lengthSec[irun] = (stopBeats-startBeats).count(); startFlag.store(false); quit.store(false); } // It's ok to capture by reference, we're running single-threaded now TM::updateTx([&] () { delete[] parray; //TM::tmFree(parray); }); // Accounting vector agg(numRuns); for (int irun = 0; irun < numRuns; irun++) { for(int i=0;i class TMTYPE, typename TMBASE> uint64_t benchmarkSPSObject(std::string& className, const seconds testLengthSeconds, const long numSwapsPerTx, const int numRuns) { long long ops[numThreads][numRuns]; long long lengthSec[numRuns]; atomic startFlag = { false }; atomic quit = { false }; struct MyObject : public TMBASE { uint64_t a {0}; // For the OneFile STMs these don't need to be tmtypes because they're immutable after visible in this benchmark uint64_t b {0}; MyObject(uint64_t a0, uint64_t b0) { a = a0; b = b0; } MyObject(const MyObject &other) { a = other.a; b = other.b; } }; // Create the array of integers and initialize it TMTYPE* parray; parray = new TMTYPE[arraySize]; // Break up the initialization into transactions of 1k stores, so it fits in the log for (long j = 0; j < arraySize; j+=1000) { TM::updateTx([&] () { for (int i = 0; i < 1000 && i+j < arraySize; i++) parray[i+j] = TM::template tmNew((uint64_t)i+j,(uint64_t)i); } ); } /* TM::updateTx([&] () { for (int i = 0; i < arraySize; i++) parray[i] = TM::template 
tmNew((uint64_t)i,(uint64_t)i); } ); */ auto func = [this,&startFlag,&quit,&numSwapsPerTx,&parray](long long *ops, const int tid) { uint64_t seed = tid+1234567890123456781ULL; // Spin until the startFlag is set while (!startFlag.load()) {} // Do transactions until the quit flag is set long long tcount = 0; while (!quit.load()) { TM::updateTx([&] () { for (int i = 0; i < numSwapsPerTx; i++) { seed = randomLong(seed); auto ia = seed%arraySize; // Create a new object with the same contents to replace the old object, at a random location MyObject* tmp = TM::template tmNew(*parray[ia]); TM::template tmDelete(parray[ia]); parray[ia] = tmp; } } ); ++tcount; } *ops = tcount; }; for (int irun = 0; irun < numRuns; irun++) { if (irun == 0) { className = TM::className(); cout << "##### " << TM::className() << " ##### \n"; } thread enqdeqThreads[numThreads]; for (int tid = 0; tid < numThreads; tid++) enqdeqThreads[tid] = thread(func, &ops[tid][irun], tid); auto startBeats = steady_clock::now(); startFlag.store(true); // Sleep for 20 seconds this_thread::sleep_for(testLengthSeconds); quit.store(true); auto stopBeats = steady_clock::now(); for (int tid = 0; tid < numThreads; tid++) enqdeqThreads[tid].join(); lengthSec[irun] = (stopBeats-startBeats).count(); startFlag.store(false); quit.store(false); } TM::updateTx([&] () { for (int i = 0; i < arraySize; i++) TM::template tmDelete(parray[i]); }); delete[] parray; // Accounting vector agg(numRuns); for (int irun = 0; irun < numRuns; irun++) { for(int i=0;i> 12; // a x ^= x << 25; // b x ^= x >> 27; // c return x * 2685821657736338717LL; } }; #endif ================================================ FILE: graphs/BenchmarkSets.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _BENCHMARK_SETS_H_ #define _BENCHMARK_SETS_H_ #include #include #include #include #include #include #include using namespace std; using namespace chrono; // Regular UserData struct UserData { long long seq; int tid; UserData(long long lseq, int ltid=0) { this->seq = lseq; this->tid = ltid; } UserData() { this->seq = -2; this->tid = -2; } UserData(const UserData &other) : seq(other.seq), tid(other.tid) { } bool operator < (const UserData& other) const { return seq < other.seq; } bool operator == (const UserData& other) const { return seq == other.seq && tid == other.tid; } bool operator != (const UserData& other) const { return seq != other.seq || tid != other.tid; } }; namespace std { template <> struct hash { std::size_t operator()(const UserData& k) const { using std::size_t; using std::hash; return (hash()(k.seq)); // This hash has no collisions, which is irealistic } }; } /** * This is a micro-benchmark of sets */ class BenchmarkSets { private: struct Result { nanoseconds nsEnq = 0ns; nanoseconds nsDeq = 0ns; long long numEnq = 0; long long numDeq = 0; long long totOpsSec = 0; Result() { } Result(const Result &other) { nsEnq = other.nsEnq; nsDeq = other.nsDeq; numEnq = other.numEnq; numDeq = other.numDeq; totOpsSec = other.totOpsSec; } bool operator < (const Result& other) const { return totOpsSec < other.totOpsSec; } }; static const long long NSEC_IN_SEC = 1000000000LL; int numThreads; public: BenchmarkSets(int numThreads) { this->numThreads = numThreads; } /** * When doing "updates" we execute a random removal and if the removal is successful we do an add() of the * same item immediately after. 
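 * (In rw_lambda below this step is:
 *    if (set->remove(*udarray[ix], tid)) { numOps++; set->add(*udarray[ix], tid); }
 *    numOps++;
 * so a successful remove-and-add counts as two operations, a failed remove as one.)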
This keeps the size of the data structure equal to the original size (minus * MAX_THREADS items at most) which gives more deterministic results. */ template long long benchmark(std::string& className, const int updateRatio, const seconds testLengthSeconds, const int numRuns, const int numElements, const bool dedicated=false) { long long ops[numThreads][numRuns]; long long lengthSec[numRuns]; atomic quit = { false }; atomic startFlag = { false }; className = S::className(); std::cout << "##### " << S::className() << " ##### \n"; S* set = new S(numThreads); // Create all the keys in the concurrent set K** udarray = new K*[numElements]; for (int i = 0; i < numElements; i++) udarray[i] = new K(i); // Add all the items to the list set->addAll(udarray, numElements, 0); // Can either be a Reader or a Writer auto rw_lambda = [this,&quit,&startFlag,&set,&udarray,&numElements](const int updateRatio, long long *ops, const int tid) { long long numOps = 0; while (!startFlag.load()) ; // spin uint64_t seed = tid+1234567890123456781ULL; while (!quit.load()) { seed = randomLong(seed); int update = seed%1000; seed = randomLong(seed); auto ix = (unsigned int)(seed%numElements); if (update < updateRatio) { // I'm a Writer if (set->remove(*udarray[ix], tid)) { numOps++; set->add(*udarray[ix], tid); } numOps++; } else { // I'm a Reader set->contains(*udarray[ix], tid); seed = randomLong(seed); ix = (unsigned int)(seed%numElements); set->contains(*udarray[ix], tid); numOps += 2; } } *ops = numOps; }; for (int irun = 0; irun < numRuns; irun++) { thread rwThreads[numThreads]; if (dedicated) { rwThreads[0] = thread(rw_lambda, 1000, &ops[0][irun], 0); rwThreads[1] = thread(rw_lambda, 1000, &ops[1][irun], 1); for (int tid = 2; tid < numThreads; tid++) rwThreads[tid] = thread(rw_lambda, updateRatio, &ops[tid][irun], tid); } else { for (int tid = 0; tid < numThreads; tid++) rwThreads[tid] = thread(rw_lambda, updateRatio, &ops[tid][irun], tid); } this_thread::sleep_for(100ms); auto startBeats = steady_clock::now(); startFlag.store(true); // Sleep for testLengthSeconds seconds this_thread::sleep_for(testLengthSeconds); quit.store(true); auto stopBeats = steady_clock::now(); for (int tid = 0; tid < numThreads; tid++) rwThreads[tid].join(); lengthSec[irun] = (stopBeats-startBeats).count(); if (dedicated) { // We don't account for the write-only operations but we aggregate the values from the two threads and display them std::cout << "Mutative transactions per second = " << (ops[0][irun] + ops[1][irun])*1000000000LL/lengthSec[irun] << "\n"; ops[0][irun] = 0; ops[1][irun] = 0; } quit.store(false); startFlag.store(false); // Compute ops at the end of each run long long agg = 0; for (int tid = 0; tid < numThreads; tid++) { agg += ops[tid][irun]*1000000000LL/lengthSec[irun]; } } // Clear the set, one key at a time and then delete the instance for (int i = 0; i < numElements; i++) set->remove(*udarray[i], 0); delete set; for (int i = 0; i < numElements; i++) delete udarray[i]; delete[] udarray; // Accounting vector agg(numRuns); for (int irun = 0; irun < numRuns; irun++) { for (int tid = 0; tid < numThreads; tid++) { agg[irun] += ops[tid][irun]*1000000000LL/lengthSec[irun]; } } // Compute the median. 
numRuns must be an odd number sort(agg.begin(),agg.end()); auto maxops = agg[numRuns-1]; auto minops = agg[0]; auto medianops = agg[numRuns/2]; auto delta = (long)(100.*(maxops-minops) / ((double)medianops)); // Printed value is the median of the number of ops per second that all threads were able to accomplish (on average) std::cout << "Ops/sec = " << medianops << " delta = " << delta << "% min = " << minops << " max = " << maxops << "\n"; return medianops; } /* * Inspired by Trevor Brown's benchmarks (does everyone else do it like this?) */ template long long benchmarkRandomFill(std::string& className, const int updateRatio, const seconds testLengthSeconds, const int numRuns, const int numElements, const bool dedicated=false) { long long ops[numThreads][numRuns]; long long lengthSec[numRuns]; atomic quit = { false }; atomic startFlag = { false }; className = S::className(); std::cout << "##### " << S::className() << " ##### \n"; S* set = new S(numThreads); // Create all the keys in the concurrent set K** udarray = new K*[2*numElements]; for (int i = 0; i < 2*numElements; i++) udarray[i] = new K(i); // Add half the keys to the list long ielem = 0; uint64_t seed = 1234567890123456781ULL; while (ielem < numElements/2) { seed = randomLong(seed); // Insert new random keys until we have 'numElements/2' keys in the tree if (set->add(*udarray[seed%(numElements)], 0)) ielem++; } // Add all keys, repeating if needed set->addAll(udarray, numElements, 0); // Can either be a Reader or a Writer auto rw_lambda = [this,&quit,&startFlag,&set,&udarray,&numElements](const int updateRatio, long long *ops, const int tid) { long long numOps = 0; while (!startFlag.load()) ; // spin uint64_t seed = tid+1234567890123456781ULL; while (!quit.load()) { seed = randomLong(seed); int update = seed%1000; seed = randomLong(seed); auto ix = (unsigned int)(seed%numElements); if (update < updateRatio) { // I'm a Writer if (set->remove(*udarray[ix], tid)) { numOps++; set->add(*udarray[ix], tid); } numOps++; } else { // I'm a Reader set->contains(*udarray[ix], tid); seed = randomLong(seed); ix = (unsigned int)(seed%numElements); set->contains(*udarray[ix], tid); numOps += 2; } } *ops = numOps; }; for (int irun = 0; irun < numRuns; irun++) { thread rwThreads[numThreads]; if (dedicated) { rwThreads[0] = thread(rw_lambda, 1000, &ops[0][irun], 0); rwThreads[1] = thread(rw_lambda, 1000, &ops[1][irun], 1); for (int tid = 2; tid < numThreads; tid++) rwThreads[tid] = thread(rw_lambda, updateRatio, &ops[tid][irun], tid); } else { for (int tid = 0; tid < numThreads; tid++) rwThreads[tid] = thread(rw_lambda, updateRatio, &ops[tid][irun], tid); } this_thread::sleep_for(100ms); auto startBeats = steady_clock::now(); startFlag.store(true); // Sleep for testLengthSeconds seconds this_thread::sleep_for(testLengthSeconds); quit.store(true); auto stopBeats = steady_clock::now(); for (int tid = 0; tid < numThreads; tid++) rwThreads[tid].join(); lengthSec[irun] = (stopBeats-startBeats).count(); if (dedicated) { // We don't account for the write-only operations but we aggregate the values from the two threads and display them std::cout << "Mutative transactions per second = " << (ops[0][irun] + ops[1][irun])*1000000000LL/lengthSec[irun] << "\n"; ops[0][irun] = 0; ops[1][irun] = 0; } quit.store(false); startFlag.store(false); // Compute ops at the end of each run long long agg = 0; for (int tid = 0; tid < numThreads; tid++) { agg += ops[tid][irun]*1000000000LL/lengthSec[irun]; } } /* Clear the tree, one key at a time and then delete the instance 
*/ for (int i = 0; i < numElements; i++) set->remove(*udarray[i], 0); delete set; for (int i = 0; i < numElements; i++) delete udarray[i]; delete[] udarray; // Accounting vector agg(numRuns); for (int irun = 0; irun < numRuns; irun++) { for (int tid = 0; tid < numThreads; tid++) { agg[irun] += ops[tid][irun]*1000000000LL/lengthSec[irun]; } } // Compute the median. numRuns must be an odd number sort(agg.begin(),agg.end()); auto maxops = agg[numRuns-1]; auto minops = agg[0]; auto medianops = agg[numRuns/2]; auto delta = (long)(100.*(maxops-minops) / ((double)medianops)); // Printed value is the median of the number of ops per second that all threads were able to accomplish (on average) std::cout << "Ops/sec = " << medianops << " delta = " << delta << "% min = " << minops << " max = " << maxops << "\n"; return medianops; } /** * An imprecise but fast random number generator */ uint64_t randomLong(uint64_t x) { x ^= x >> 12; // a x ^= x << 25; // b x ^= x >> 27; // c return x * 2685821657736338717LL; } }; #endif ================================================ FILE: graphs/Makefile ================================================ CXX = g++-8 CXXFLAGS = -std=c++17 -g -O2 -DPWB_IS_CLFLUSHOPT # -fuse-ld=gold -fsanitize=address # For castor-1 #CXXFLAGS = -std=c++17 -g -O2 -DPWB_IS_CLWB -DPM_REGION_SIZE=64*1024*1024*1024ULL -DPM_USE_DAX -DPM_FILE_NAME="\"/mnt/pmem0/durable\"" # Possible options for PWB are: # -DPWB_IS_CLFLUSH pwb is a CLFLUSH and pfence/psync are nops (Broadwell) # -DPWB_IS_CLFLUSHOPT pwb is a CLFLUSHOPT and pfence/psync are SFENCE (Kaby Lake) # -DPWB_IS_CLWB pwb is a CLWB and pfence/psync are SFENCE (Sky Lake SP, or Canon Lake SP and beyond) # -DPWB_IS_NOP pwb/pfence/psync are nops. Used for shared memory persistence INCLUDES = -I../ -I../common/ #LIBS = -l/home/vagrant/tinystm/lib/libstm.a # This library is needed for ESTM ESTM_LIB = -L../stms/estm-0.3.0/lib/ -lstm -lpthread # This library is needed for TinySTM TINYSTM_LIB = -L../stms/tinystm/lib/ -lstm -lpthread TINYSTM_INC = -I../stms/tinystm/ # This library is needed for PMDK PMDKLIBS = -L/usr/local/lib -lpmemobj BINARIES = \ bin/sps-integer \ bin/sps-integer-tiny \ bin/sps-object \ bin/sps-object-tiny \ bin/set-ll-1k \ bin/set-ll-1k-tiny \ bin/set-ll-10k \ bin/set-ll-10k-tiny \ bin/set-tree-1k \ bin/set-tree-1k-tiny \ bin/set-tree-10k \ bin/set-tree-10k-tiny \ bin/set-tree-1m \ bin/set-tree-1m-tiny \ bin/set-hash-1k \ bin/set-hash-1k-tiny \ bin/q-ll-enq-deq \ bin/q-ll-enq-deq-tiny \ bin/q-array-enq-deq \ bin/q-array-enq-deq-tiny \ bin/psps-integer \ bin/pset-ll-1k \ bin/pset-ll-10k \ bin/pset-hash-1k \ bin/pset-tree-1k \ bin/pq-ll-enq-deq \ bin/latency-counter \ bin/latency-counter-tiny \ bin/pset-tree-1m-oflf \ bin/pset-tree-1m-ofwf \ bin/pset-tree-1m-pmdk \ bin/pset-tree-1m-romlog \ bin/pset-tree-1m-romlr \ # bin/pset-tree-1m-pmdk \ # bin/pread-while-writing-romlog \ bin/pread-while-writing-romlr \ bin/pread-while-writing-oflf \ bin/pread-while-writing-ofwf \ bin/pread-while-writing-pmdk \ STMS = \ ../stms/CRWWPSTM.hpp \ ../stms/OneFileLF.hpp \ ../stms/OneFileWF.hpp \ ../stms/TinySTM.hpp \ ../stms/tinystm/lib/libstm.a \ PTMS = \ ../ptms/OneFilePTMLF.hpp \ ../ptms/OneFilePTMWF.hpp \ ../ptms/PMDKTM.hpp \ lib/libromulus.a \ ../ptms/romuluslog/RomulusLog.hpp \ ../ptms/romuluslr/RomulusLR.hpp \ SRC_LISTS = \ ../datastructures/linkedlists/CRWWPLinkedListSet.hpp \ ../datastructures/linkedlists/ESTMLinkedListSet.hpp \ ../datastructures/linkedlists/OFLFLinkedListSet.hpp \ ../datastructures/linkedlists/OFWFLinkedListSet.hpp \ 
../datastructures/linkedlists/STMLinkedListSet.hpp \ ../datastructures/linkedlists/TinySTMLinkedListSet.hpp \ SRC_TREES = \ ../datastructures/treemaps/ESTMRedBlackTree.hpp \ ../datastructures/treemaps/NatarajanTreeHE.hpp \ ../datastructures/treemaps/OFLFRedBlackTree.hpp \ ../datastructures/treemaps/OFWFRedBlackTree.hpp \ QUEUES_DEP = \ ../datastructures/queues/ESTMArrayLinkedListQueue.hpp \ ../datastructures/queues/ESTMLinkedListQueue.hpp \ ../datastructures/queues/FAAArrayQueue.hpp \ ../datastructures/queues/LCRQueue.hpp \ ../datastructures/queues/MichaelScottQueue.hpp \ ../datastructures/queues/OFLFArrayLinkedListQueue.hpp \ ../datastructures/queues/OFLFLinkedListQueue.hpp \ ../datastructures/queues/OFWFArrayLinkedListQueue.hpp \ ../datastructures/queues/OFWFLinkedListQueue.hpp \ ../datastructures/queues/TurnQueue.hpp \ PQUEUES_DEP = \ ../pdatastructures/pqueues/MichaelScottQueue.hpp \ ../pdatastructures/pqueues/PFriedmanQueue.hpp \ ../pdatastructures/pqueues/PMDKLinkedListQueue.hpp \ ../pdatastructures/pqueues/PMichaelScottQueue.hpp \ ../pdatastructures/TMLinkedListQueue.hpp \ ../pdatastructures/pqueues/POFLFLinkedListQueue.hpp \ ../pdatastructures/pqueues/POFWFLinkedListQueue.hpp \ ../pdatastructures/pqueues/RomLogLinkedListQueue.hpp \ ../pdatastructures/pqueues/RomLRLinkedListQueue.hpp \ ROMULUS_LIB_SRC = \ ../common/ThreadRegistry.cpp \ ../ptms/romuluslog/malloc.cpp \ ../ptms/romuluslog/RomulusLog.cpp \ ../ptms/romuluslr/malloc.cpp \ ../ptms/romuluslr/RomulusLR.cpp \ ROMULUS_LIB_DEP = \ $(ROMULUS_LIB_SRC) \ ../ptms/romuluslog/RomulusLog.hpp \ ../ptms/romuluslr/RomulusLR.hpp \ TREVOR_BROWN_INCLUDES = \ -I../datastructures/trevor_brown_abtree/common/recordmgr \ -I../datastructures/trevor_brown_abtree/common \ -I../datastructures/trevor_brown_abtree/common/descriptors \ -I../datastructures/trevor_brown_abtree/common/rq \ -I../datastructures/trevor_brown_abtree/common/rq \ -I../datastructures/trevor_brown_abtree/common/atomic_ops \ all: $(BINARIES) persistencyclean clean: persistencyclean rm -f bin/* rm -f lib/* persistencyclean: rm -f /dev/shm/*_shared rm -f /dev/shm/psegments/* # # Create a library for RomulusLog and RomulusLR # lib/threadregistry.o: $(ROMULUS_LIB_DEP) $(CXX) $(CXXFLAGS) $(INCLUDES) -c ../common/ThreadRegistry.cpp -o lib/threadregistry.o lib/mallocromlog.o: $(ROMULUS_LIB_DEP) $(CXX) $(CXXFLAGS) $(INCLUDES) -c ../ptms/romuluslog/malloc.cpp -o lib/mallocromlog.o lib/romlog.o: $(ROMULUS_LIB_DEP) $(CXX) $(CXXFLAGS) $(INCLUDES) -c ../ptms/romuluslog/RomulusLog.cpp -o lib/romlog.o lib/mallocromlr.o: $(ROMULUS_LIB_DEP) $(CXX) $(CXXFLAGS) $(INCLUDES) -c ../ptms/romuluslr/malloc.cpp -o lib/mallocromlr.o lib/romlr.o: $(ROMULUS_LIB_DEP) $(CXX) $(CXXFLAGS) $(INCLUDES) -c ../ptms/romuluslr/RomulusLR.cpp -o lib/romlr.o lib/libromulus.a: lib/threadregistry.o lib/mallocromlog.o lib/romlog.o lib/mallocromlr.o lib/romlr.o ar rcs lib/libromulus.a lib/threadregistry.o lib/mallocromlog.o lib/romlog.o lib/mallocromlr.o lib/romlr.o # # Queues for volatile memory # bin/q-ll-enq-deq: q-ll-enq-deq.cpp $(STMS) $(QUEUES_DEP) $(CXX) $(CXXFLAGS) $(INCLUDES) $(CSRCS) q-ll-enq-deq.cpp -o bin/q-ll-enq-deq -lpthread $(ESTM_LIB) bin/q-array-enq-deq: q-array-enq-deq.cpp $(STMS) $(QUEUES_DEP) $(CXX) $(CXXFLAGS) $(INCLUDES) $(CSRCS) q-array-enq-deq.cpp -o bin/q-array-enq-deq -lpthread $(ESTM_LIB) bin/q-ll-burst: q-ll-burst.cpp $(QUEUES_DEP) $(CXX) $(CXXFLAGS) $(INCLUDES) $(CSRCS) q-ll-burst.cpp -o bin/q-ll-burst -lpthread $(ESTM_LIB) # Same as above but for TinySTM bin/q-ll-enq-deq-tiny: q-ll-enq-deq.cpp 
$(STMS) $(QUEUES_DEP) $(CXX) $(CXXFLAGS) -DUSE_TINY $(INCLUDES) $(TINYSTM_INC) $(CSRCS) q-ll-enq-deq.cpp -o bin/q-ll-enq-deq-tiny -lpthread $(TINYSTM_LIB) bin/q-array-enq-deq-tiny: q-array-enq-deq.cpp $(STMS) $(QUEUES_DEP) $(CXX) $(CXXFLAGS) -DUSE_TINY $(INCLUDES) $(TINYSTM_INC) $(CSRCS) q-array-enq-deq.cpp -o bin/q-array-enq-deq-tiny -lpthread $(TINYSTM_LIB) # # Queues for persistent memory # bin/pq-ll-enq-deq: pq-ll-enq-deq.cpp $(PTMS) $(PQUEUES_DEP) PBenchmarkQueues.hpp $(CXX) $(CXXFLAGS) $(INCLUDES) pq-ll-enq-deq.cpp -o bin/pq-ll-enq-deq -lpthread $(PMDKLIBS) lib/libromulus.a # # Sets for volatile memory # bin/set-ll-1k: set-ll-1k.cpp $(STMS) $(SRC_LISTS) $(CXX) $(CXXFLAGS) $(INCLUDES) $(CSRCS) set-ll-1k.cpp -o bin/set-ll-1k -lpthread $(ESTM_LIB) bin/set-ll-10k: set-ll-10k.cpp $(STMS) $(SRC_LISTS) $(CXX) $(CXXFLAGS) $(INCLUDES) $(CSRCS) set-ll-10k.cpp -o bin/set-ll-10k -lpthread $(ESTM_LIB) bin/set-tree-1k: set-tree-1k.cpp $(STMS) $(SRC_TREES) $(CXX) $(CXXFLAGS) -fuse-ld=gold -fsanitize=address $(INCLUDES) $(TREVOR_BROWN_INCLUDES) ../common/ThreadRegistry.cpp $(CSRCS) set-tree-1k.cpp -o bin/set-tree-1k -lpthread $(ESTM_LIB) bin/set-tree-10k: set-tree-10k.cpp $(STMS) $(SRC_TREES) $(CXX) $(CXXFLAGS) $(INCLUDES) $(TREVOR_BROWN_INCLUDES) ../common/ThreadRegistry.cpp $(CSRCS) set-tree-10k.cpp -o bin/set-tree-10k -lpthread $(ESTM_LIB) bin/set-tree-1m: set-tree-1m.cpp $(STMS) $(SRC_TREES) $(CXX) $(CXXFLAGS) $(INCLUDES) $(TREVOR_BROWN_INCLUDES) ../common/ThreadRegistry.cpp $(CSRCS) set-tree-1m.cpp -o bin/set-tree-1m -lpthread $(ESTM_LIB) bin/set-hash-1k: set-hash-1k.cpp $(STMS) $(CXX) $(CXXFLAGS) $(INCLUDES) $(CSRCS) set-hash-1k.cpp -o bin/set-hash-1k -lpthread $(ESTM_LIB) # Same as above, but for Tiny STM only bin/set-ll-1k-tiny: set-ll-1k.cpp $(STMS) $(SRC_LISTS) $(CXX) $(CXXFLAGS) -DUSE_TINY $(INCLUDES) $(TINYSTM_INC) $(CSRCS) set-ll-1k.cpp -o bin/set-ll-1k-tiny -lpthread $(TINYSTM_LIB) bin/set-ll-10k-tiny: set-ll-10k.cpp $(STMS) $(SRC_LISTS) $(CXX) $(CXXFLAGS) -DUSE_TINY $(INCLUDES) $(TINYSTM_INC) $(CSRCS) set-ll-10k.cpp -o bin/set-ll-10k-tiny -lpthread $(TINYSTM_LIB) bin/set-tree-1k-tiny: set-tree-1k.cpp $(STMS) $(CXX) $(CXXFLAGS) -DUSE_TINY $(INCLUDES) $(TINYSTM_INC) $(CSRCS) set-tree-1k.cpp -o bin/set-tree-1k-tiny -lpthread $(TINYSTM_LIB) bin/set-tree-10k-tiny: set-tree-10k.cpp $(STMS) $(CXX) $(CXXFLAGS) -DUSE_TINY $(INCLUDES) $(TINYSTM_INC) $(CSRCS) set-tree-10k.cpp -o bin/set-tree-10k-tiny -lpthread $(TINYSTM_LIB) bin/set-tree-1m-tiny: set-tree-1m.cpp $(STMS) $(CXX) $(CXXFLAGS) -DUSE_TINY $(INCLUDES) $(TINYSTM_INC) $(CSRCS) set-tree-1m.cpp -o bin/set-tree-1m-tiny -lpthread $(TINYSTM_LIB) bin/set-hash-1k-tiny: set-hash-1k.cpp $(STMS) $(CXX) $(CXXFLAGS) -DUSE_TINY $(INCLUDES) $(TINYSTM_INC) $(CSRCS) set-hash-1k.cpp -o bin/set-hash-1k-tiny -lpthread $(TINYSTM_LIB) # # Sets for persistent memory # bin/pset-ll-1k: pset-ll-1k.cpp $(PTMS) PBenchmarkSets.hpp ../pdatastructures/TMLinkedListSet.hpp lib/libromulus.a $(CXX) $(CXXFLAGS) $(INCLUDES) pset-ll-1k.cpp -o bin/pset-ll-1k -lpthread $(PMDKLIBS) lib/libromulus.a bin/pset-ll-10k: pset-ll-10k.cpp $(PTMS) PBenchmarkSets.hpp ../pdatastructures/TMLinkedListSet.hpp lib/libromulus.a $(CXX) $(CXXFLAGS) $(INCLUDES) pset-ll-10k.cpp -o bin/pset-ll-10k -lpthread $(PMDKLIBS) lib/libromulus.a bin/pset-hash-1k: pset-hash-1k.cpp $(PTMS) PBenchmarkSets.hpp ../pdatastructures/TMHashMap.hpp lib/libromulus.a $(CXX) $(CXXFLAGS) $(INCLUDES) pset-hash-1k.cpp -o bin/pset-hash-1k -lpthread $(PMDKLIBS) lib/libromulus.a bin/pset-tree-1k: pset-tree-1k.cpp $(PTMS) 
PBenchmarkSets.hpp ../pdatastructures/TMRedBlackTree.hpp lib/libromulus.a $(CXX) $(CXXFLAGS) $(INCLUDES) pset-tree-1k.cpp -o bin/pset-tree-1k -lpthread $(PMDKLIBS) lib/libromulus.a bin/pset-tree-1m: pset-tree-1m.cpp $(PTMS) PBenchmarkSets.hpp ../pdatastructures/TMRedBlackTree.hpp lib/libromulus.a $(CXX) $(CXXFLAGS) $(INCLUDES) pset-tree-1m.cpp -o bin/pset-tree-1m -lpthread $(PMDKLIBS) lib/libromulus.a # # SPS for volatile memory # bin/sps-integer: sps-integer.cpp $(STMS) BenchmarkSPS.hpp $(CXX) $(CXXFLAGS) $(INCLUDES) $(CSRCS) sps-integer.cpp -o bin/sps-integer -lpthread $(ESTM_LIB) bin/sps-integer-tiny: sps-integer.cpp $(STMS) BenchmarkSPS.hpp $(CXX) $(CXXFLAGS) -DUSE_TINY $(INCLUDES) $(TINYSTM_INC) $(CSRCS) sps-integer.cpp -o bin/sps-integer-tiny -lpthread $(TINYSTM_LIB) bin/sps-object: sps-object.cpp $(STMS) BenchmarkSPS.hpp $(CXX) $(CXXFLAGS) $(INCLUDES) $(CSRCS) sps-object.cpp -o bin/sps-object -lpthread $(ESTM_LIB) bin/sps-object-tiny: sps-object.cpp $(STMS) BenchmarkSPS.hpp $(CXX) $(CXXFLAGS) -DUSE_TINY $(INCLUDES) $(TINYSTM_INC) $(CSRCS) sps-object.cpp -o bin/sps-object-tiny -lpthread $(TINYSTM_LIB) # # SPS for persistent memory # bin/psps-integer: psps-integer.cpp $(PTMS) PBenchmarkSPS.hpp lib/libromulus.a $(CXX) $(CXXFLAGS) $(INCLUDES) psps-integer.cpp -o bin/psps-integer -lpthread $(PMDKLIBS) lib/libromulus.a bin/psps-integer-atlas: psps-integer-atlas.cpp $(PTMS) PBenchmarkSPS.hpp persistencyclean lib/libromulus.a $(CXX) $(CXXFLAGS) $(INCLUDES) psps-integer-atlas.cpp -o bin/psps-integer-atlas -lpthread lib/libromulus.a # TODO: is it worth doing sps-object for PTMs ? # # Latency for STMs # bin/latency-counter: latency-counter.cpp $(STMS) BenchmarkLatencyCounter.hpp $(CXX) $(CXXFLAGS) $(INCLUDES) $(CSRCS) latency-counter.cpp -o bin/latency-counter -lpthread $(ESTM_LIB) bin/latency-counter-tiny: latency-counter.cpp $(STMS) BenchmarkLatencyCounter.hpp $(CXX) $(CXXFLAGS) -DUSE_TINY $(INCLUDES) $(TINYSTM_INC) $(CSRCS) latency-counter.cpp -o bin/latency-counter-tiny -lpthread $(TINYSTM_LIB) # # Persistent balanced tree with 1M keys. Must be compiled one at a time otherwise you get all the NVM heaps allocated, which is too much # bin/pset-tree-1m-romlog: pset-tree-1m.cpp PBenchmarkSets.hpp ../pdatastructures/TMRedBlackTree.hpp lib/libromulus.a $(CXX) $(CXXFLAGS) -DUSE_ROMLOG $(INCLUDES) pset-tree-1m.cpp -o bin/pset-tree-1m-romlog -lpthread lib/libromulus.a bin/pset-tree-1m-romlr: pset-tree-1m.cpp PBenchmarkSets.hpp ../pdatastructures/TMRedBlackTree.hpp lib/libromulus.a $(CXX) $(CXXFLAGS) -DUSE_ROMLR $(INCLUDES) pset-tree-1m.cpp -o bin/pset-tree-1m-romlr -lpthread lib/libromulus.a bin/pset-tree-1m-oflf: pset-tree-1m.cpp PBenchmarkSets.hpp ../pdatastructures/TMRedBlackTree.hpp ../ptms/OneFilePTMLF.hpp $(CXX) $(CXXFLAGS) -DUSE_OFLF $(INCLUDES) pset-tree-1m.cpp -o bin/pset-tree-1m-oflf -lpthread bin/pset-tree-1m-ofwf: pset-tree-1m.cpp PBenchmarkSets.hpp ../pdatastructures/TMRedBlackTree.hpp ../ptms/OneFilePTMWF.hpp $(CXX) $(CXXFLAGS) -DUSE_OFWF $(INCLUDES) pset-tree-1m.cpp -o bin/pset-tree-1m-ofwf -lpthread bin/pset-tree-1m-pmdk: pset-tree-1m.cpp PBenchmarkSets.hpp ../pdatastructures/TMRedBlackTree.hpp $(CXX) $(CXXFLAGS) -DUSE_PMDK $(INCLUDES) pset-tree-1m.cpp -o bin/pset-tree-1m-pmdk -lpthread $(PMDKLIBS) # experimental... 
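# Example usage (editorial; assumes you build and run from the graphs/ directory):
#   make bin/pset-tree-1m-oflf
#   ./bin/pset-tree-1m-oflf
# As noted above, build and run one pset-tree-1m-* variant at a time.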
bin/pread-while-writing-romlog: pread-while-writing.cpp PBenchmarkSets.hpp ../pdatastructures/TMRedBlackTree.hpp lib/libromulus.a $(CXX) $(CXXFLAGS) -DUSE_ROMLOG $(INCLUDES) pread-while-writing.cpp -o bin/pread-while-writing-romlog -lpthread lib/libromulus.a bin/pread-while-writing-romlr: pread-while-writing.cpp PBenchmarkSets.hpp ../pdatastructures/TMRedBlackTree.hpp lib/libromulus.a $(CXX) $(CXXFLAGS) -DUSE_ROMLR $(INCLUDES) pread-while-writing.cpp -o bin/pread-while-writing-romlr -lpthread lib/libromulus.a bin/pread-while-writing-oflf: pread-while-writing.cpp PBenchmarkSets.hpp ../pdatastructures/TMRedBlackTree.hpp ../ptms/OneFilePTMLF.hpp $(CXX) $(CXXFLAGS) -DUSE_OFLF $(INCLUDES) pread-while-writing.cpp -o bin/pread-while-writing-oflf -lpthread bin/pread-while-writing-ofwf: pread-while-writing.cpp PBenchmarkSets.hpp ../pdatastructures/TMRedBlackTree.hpp ../ptms/OneFilePTMWF.hpp $(CXX) $(CXXFLAGS) -DUSE_OFWF $(INCLUDES) pread-while-writing.cpp -o bin/pread-while-writing-ofwf -lpthread bin/pread-while-writing-pmdk: pread-while-writing.cpp PBenchmarkSets.hpp ../pdatastructures/TMRedBlackTree.hpp $(CXX) $(CXXFLAGS) -DUSE_PMDK $(INCLUDES) pread-while-writing.cpp -o bin/pread-while-writing-pmdk -lpthread $(PMDKLIBS) ================================================ FILE: graphs/PBenchmarkQueues.hpp ================================================ /* * Copyright 2017-2018 * Andreia Correia * Pedro Ramalhete * Pascal Felber * Nachshon Cohen * * This work is published under the MIT license. See LICENSE.txt */ #ifndef _PERSISTENT_BENCHMARK_Q_H_ #define _PERSISTENT_BENCHMARK_Q_H_ #include #include #include #include #include #include #include using namespace std; using namespace chrono; struct UserData { long long seq; int tid; UserData(long long lseq, int ltid) { this->seq = lseq; this->tid = ltid; } UserData() { this->seq = -2; this->tid = -2; } UserData(const UserData &other) : seq(other.seq), tid(other.tid) { } bool operator < (const UserData& other) const { return seq < other.seq; } }; /** * This is a micro-benchmark for persistent queues */ class PBenchmarkQueues { private: struct Result { nanoseconds nsEnq = 0ns; nanoseconds nsDeq = 0ns; long long numEnq = 0; long long numDeq = 0; long long totOpsSec = 0; Result() { } Result(const Result &other) { nsEnq = other.nsEnq; nsDeq = other.nsDeq; numEnq = other.numEnq; numDeq = other.numDeq; totOpsSec = other.totOpsSec; } bool operator < (const Result& other) const { return totOpsSec < other.totOpsSec; } }; // Performance benchmark constants static const long long kNumPairsWarmup = 1000000LL; // Each threads does 1M iterations as warmup // Contants for Ping-Pong performance benchmark static const int kPingPongBatch = 1000; // Each thread starts by injecting 1k items in the queue static const long long NSEC_IN_SEC = 1000000000LL; int numThreads; public: PBenchmarkQueues(int numThreads) { this->numThreads = numThreads; } /** * enqueue-dequeue pairs: in each iteration a thread executes an enqueue followed by a dequeue; * the benchmark executes 10^8 pairs partitioned evenly among all threads; * WARNING: If you modify this, please modify enqDeqNoTransaction() also */ template uint64_t enqDeq(std::string& className, const long numPairs, const int numRuns) { nanoseconds deltas[numThreads][numRuns]; atomic startFlag = { false }; Q* queue = nullptr; className = Q::className(); cout << "##### " << className << " ##### \n"; auto enqdeq_lambda = [this,&startFlag,&numPairs,&queue](nanoseconds *delta, const int tid) { //UserData* ud = new UserData{0,0}; uint64_t* 
ud = new uint64_t(42); while (!startFlag.load()) {} // Spin until the startFlag is set // Warmup phase for (long long iter = 0; iter < numPairs/(numThreads*10); iter++) { // Do 1/10 iterations as warmup PTM::updateTx([=] () { queue->enqueue(*ud, tid); if (queue->dequeue(tid) == queue->EMPTY) cout << "Error at warmup dequeueing iter=" << iter << "\n"; }); } // Measurement phase auto startBeats = steady_clock::now(); for (long long iter = 0; iter < numPairs/numThreads; iter++) { PTM::updateTx([=] () { queue->enqueue(*ud, tid); if (queue->dequeue(tid) == queue->EMPTY) cout << "Error at measurement dequeueing iter=" << iter << "\n"; }); } auto stopBeats = steady_clock::now(); *delta = stopBeats - startBeats; }; for (int irun = 0; irun < numRuns; irun++) { PTM::updateTx([&] () { // It's ok to capture by reference, only the main thread is active (but it is not ok for CX-PTM) queue = PTM::template tmNew(); }); thread enqdeqThreads[numThreads]; for (int tid = 0; tid < numThreads; tid++) enqdeqThreads[tid] = thread(enqdeq_lambda, &deltas[tid][irun], tid); startFlag.store(true); // Sleep for 2 seconds just to let the threads see the startFlag this_thread::sleep_for(2s); for (int tid = 0; tid < numThreads; tid++) enqdeqThreads[tid].join(); startFlag.store(false); PTM::updateTx([=] () { PTM::tmDelete(queue); }); } // Sum up all the time deltas of all threads so we can find the median run vector agg(numRuns); for (int irun = 0; irun < numRuns; irun++) { agg[irun] = 0ns; for (int tid = 0; tid < numThreads; tid++) { agg[irun] += deltas[tid][irun]; } } // Compute the median. numRuns should be an odd number sort(agg.begin(),agg.end()); auto median = agg[numRuns/2].count()/numThreads; // Normalize back to per-thread time (mean of time for this run) cout << "Total Ops/sec = " << numPairs*2*NSEC_IN_SEC/median << "\n"; return (numPairs*2*NSEC_IN_SEC/median); } /* * WARNING: If you modify this, please modify enqDeq() also */ template uint64_t enqDeqNoTransaction(std::string& className, const long numPairs, const int numRuns) { nanoseconds deltas[numThreads][numRuns]; atomic startFlag = { false }; Q* queue = nullptr; className = Q::className(); cout << "##### " << className << " ##### \n"; auto enqdeq_lambda = [this,&startFlag,&numPairs,&queue](nanoseconds *delta, const int tid) { uint64_t* ud = new uint64_t(42); while (!startFlag.load()) {} // Spin until the startFlag is set // Warmup phase for (long long iter = 0; iter < numPairs/(numThreads*10); iter++) { // Do 1/10 iterations as warmup queue->enqueue(*ud, tid); if (queue->dequeue(tid) == queue->EMPTY) cout << "Error at warmup dequeueing iter=" << iter << "\n"; } // Measurement phase auto startBeats = steady_clock::now(); for (long long iter = 0; iter < numPairs/numThreads; iter++) { queue->enqueue(*ud, tid); if (queue->dequeue(tid) == queue->EMPTY) cout << "Error at measurement dequeueing iter=" << iter << "\n"; } auto stopBeats = steady_clock::now(); *delta = stopBeats - startBeats; }; for (int irun = 0; irun < numRuns; irun++) { queue = new Q(); // TODO: use a PTM allocator, maybe the one in PMDK thread enqdeqThreads[numThreads]; for (int tid = 0; tid < numThreads; tid++) enqdeqThreads[tid] = thread(enqdeq_lambda, &deltas[tid][irun], tid); startFlag.store(true); // Sleep for 2 seconds just to let the threads see the startFlag this_thread::sleep_for(2s); for (int tid = 0; tid < numThreads; tid++) enqdeqThreads[tid].join(); startFlag.store(false); delete queue; // TODO: use PTM de-allocator } // Sum up all the time deltas of all threads so we can find the 
median run vector agg(numRuns); for (int irun = 0; irun < numRuns; irun++) { agg[irun] = 0ns; for (int tid = 0; tid < numThreads; tid++) { agg[irun] += deltas[tid][irun]; } } // Compute the median. numRuns should be an odd number sort(agg.begin(),agg.end()); auto median = agg[numRuns/2].count()/numThreads; // Normalize back to per-thread time (mean of time for this run) cout << "Total Ops/sec = " << numPairs*2*NSEC_IN_SEC/median << "\n"; return (numPairs*2*NSEC_IN_SEC/median); } /** * Start with only enqueues 100K/numThreads, wait for them to finish, then do only dequeues but only 100K/numThreads * TODO: must fix this for persistency, not yet working */ template void burst(std::string& className, uint64_t& resultsEnq, uint64_t& resultsDeq, const long long burstSize, const int numIters, const int numRuns, const bool isSC=false) { Result results[numThreads][numRuns]; atomic startEnq = { false }; atomic startDeq = { false }; atomic barrier = { 0 }; Q* queue = nullptr; auto burst_lambda = [this,&startEnq,&startDeq,&burstSize,&barrier,&numIters,&isSC,&queue](Result *res, const int tid) { UserData ud(0,0); // Warmup only if it is not Single-Consumer if (!isSC) { const long long warmupIters = 100000LL; // Do 100K for each thread as a warmup for (long long iter = 0; iter < warmupIters; iter++) queue->enqueue(&ud, tid); for (long long iter = 0; iter < warmupIters; iter++) { if (queue->dequeue(tid) == nullptr) cout << "ERROR: warmup dequeued nullptr in iter=" << iter << "\n"; } } // Measurements for (int iter = 0; iter < numIters; iter++) { // Start with enqueues while (!startEnq.load()) {} // spin is better than yield here auto startBeats = steady_clock::now(); for (long long i = 0; i < burstSize/numThreads; i++) { queue->enqueue(&ud, tid); } auto stopBeats = steady_clock::now(); res->nsEnq += (stopBeats-startBeats); res->numEnq += burstSize/numThreads; if (barrier.fetch_add(1) == numThreads) cout << "ERROR: in barrier\n"; // dequeues while (!startDeq.load()) { } // spin is better than yield here if (isSC) { // Handle the single-consumer case if (tid == 0) { startBeats = steady_clock::now(); // We need to deal with rounding errors in the single-consumer case for (long long i = 0; i < ((long long)(burstSize/numThreads))*numThreads; i++) { if (queue->dequeue(tid) == nullptr) { cout << "ERROR: dequeued nullptr in iter=" << i << "\n"; assert(false); } } stopBeats = steady_clock::now(); if (queue->dequeue(tid) != nullptr) cout << "ERROR: dequeued non-null, there must be duplicate items!\n"; res->nsDeq += (stopBeats-startBeats); res->numDeq += burstSize/numThreads; } } else { startBeats = steady_clock::now(); for (long long i = 0; i < burstSize/numThreads; i++) { if (queue->dequeue(tid) == nullptr) { cout << "ERROR: dequeued nullptr in iter=" << i << "\n"; assert(false); } } stopBeats = steady_clock::now(); res->nsDeq += (stopBeats-startBeats); res->numDeq += burstSize/numThreads; } if (barrier.fetch_add(1) == numThreads) cout << "ERROR: in barrier\n"; } }; for (int irun = 0; irun < numRuns; irun++) { queue = new Q(numThreads); if (irun == 0) { className = queue->className(); cout << "##### " << queue->className() << " ##### \n"; } thread burstThreads[numThreads]; for (int tid = 0; tid < numThreads; tid++) burstThreads[tid] = thread(burst_lambda, &results[tid][irun], tid); this_thread::sleep_for(100ms); for (int iter=0; iter < numIters; iter++) { // enqueue round startEnq.store(true); while (barrier.load() != numThreads) this_thread::yield(); startEnq.store(false); long tmp = numThreads; if 
        for (int irun = 0; irun < numRuns; irun++) {
            queue = new Q(numThreads);
            if (irun == 0) {
                className = queue->className();
                cout << "##### " << queue->className() << " ##### \n";
            }
            thread burstThreads[numThreads];
            for (int tid = 0; tid < numThreads; tid++) burstThreads[tid] = thread(burst_lambda, &results[tid][irun], tid);
            this_thread::sleep_for(100ms);
            for (int iter = 0; iter < numIters; iter++) {
                // Enqueue round
                startEnq.store(true);
                while (barrier.load() != numThreads) this_thread::yield();
                startEnq.store(false);
                long tmp = numThreads;
                if (!barrier.compare_exchange_strong(tmp, 0)) cout << "ERROR: CAS\n";
                // Dequeue round
                startDeq.store(true);
                while (barrier.load() != numThreads) this_thread::yield();
                startDeq.store(false);
                tmp = numThreads;
                if (!barrier.compare_exchange_strong(tmp, 0)) cout << "ERROR: CAS\n";
            }
            for (int tid = 0; tid < numThreads; tid++) burstThreads[tid].join();
            delete queue;
        }

        // Accounting
        vector<Result> agg(numRuns);
        for (int irun = 0; irun < numRuns; irun++) {
            nanoseconds maxNsEnq = 0ns;
            nanoseconds maxNsDeq = 0ns;
            for (int tid = 0; tid < numThreads; tid++) {
                if (results[tid][irun].nsEnq > maxNsEnq) maxNsEnq = results[tid][irun].nsEnq;
                if (results[tid][irun].nsDeq > maxNsDeq) maxNsDeq = results[tid][irun].nsDeq;
                agg[irun].numEnq += results[tid][irun].numEnq;
                agg[irun].numDeq += results[tid][irun].numDeq;
            }
            agg[irun].nsEnq = maxNsEnq;
            agg[irun].nsDeq = maxNsDeq;
            agg[irun].totOpsSec = agg[irun].nsEnq.count()+agg[irun].nsDeq.count();
        }

        // Compute the median. numRuns should be an odd number
        sort(agg.begin(), agg.end());
        Result median = agg[numRuns/2];
        const long long allThreadsEnqPerSec = median.numEnq*NSEC_IN_SEC/median.nsEnq.count();
        const long long allThreadsDeqPerSec = median.numDeq*NSEC_IN_SEC/median.nsDeq.count();
        // Printed value is the median of the number of ops per second that all threads were able to accomplish (on average)
        cout << "Enq/sec = " << allThreadsEnqPerSec << "   Deq/sec = " << allThreadsDeqPerSec << "\n";
        resultsEnq = allThreadsEnqPerSec;
        resultsDeq = allThreadsDeqPerSec;
    }
};

#endif


================================================
FILE: graphs/PBenchmarkSPS.hpp
================================================
/*
 * Copyright 2017-2018
 *   Andreia Correia
 *   Pedro Ramalhete
 *   Pascal Felber
 *   Nachshon Cohen
 *
 * This work is published under the MIT license. See LICENSE.txt
 */
#ifndef _PERSISTENT_BENCHMARK_SPS_H_
#define _PERSISTENT_BENCHMARK_SPS_H_

#include <atomic>
#include <chrono>
#include <thread>
#include <string>
#include <vector>
#include <algorithm>
#include <iostream>
#include <cstdint>

static const long arraySize = 1000*1000;   // 1M entries in the SPS array

using namespace std;
using namespace chrono;

/**
 * This is a micro-benchmark with integer swaps (SPS) for PTMs
 */
class PBenchmarkSPS {

private:
    int numThreads;

public:
    struct UserData {
        long long seq;
        int tid;
        UserData(long long lseq, int ltid) {
            this->seq = lseq;
            this->tid = ltid;
        }
        UserData() {
            this->seq = -2;
            this->tid = -2;
        }
        UserData(const UserData &other) : seq(other.seq), tid(other.tid) { }

        bool operator < (const UserData& other) const {
            return seq < other.seq;
        }
    };

    PBenchmarkSPS(int numThreads) {
        this->numThreads = numThreads;
    }
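    /*
     * What one SPS transaction does (sketch with plain types; 'array' stands
     * in for the PERSIST<uint64_t> array allocated in benchmarkSPSInteger()
     * below): pick two random positions and swap their contents. A swap only
     * permutes the array, so the multiset of stored values is invariant across
     * transactions, which is what a read-only consistency check can verify.
     *
     *   uint64_t tmp = array[ia];   // ia, ib are random indexes in [0, arraySize)
     *   array[ia] = array[ib];
     *   array[ib] = tmp;
     */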
    /*
     * An array of integers that gets randomly permutated.
     */
    template<typename PTM, template<typename> class PERSIST>
    uint64_t benchmarkSPSInteger(std::string& className, const seconds testLengthSeconds, const long numSwapsPerTx, const int numRuns) {
        long long ops[numThreads][numRuns];
        long long lengthSec[numRuns];
        atomic<bool> startFlag = { false };
        atomic<bool> quit = { false };

        // Create the array of integers and initialize it, saving it in root pointer 0
        int larraySize = arraySize;
        PTM::template updateTx<bool>([larraySize] () {
            //PTM::pfree( PTM::template get_object<PERSIST<uint64_t>*>(0) ); // TODO: re-enable this after we add the clear of objects as a transaction in CX
            PTM::put_object(0, PTM::pmalloc( larraySize*sizeof(PERSIST<uint64_t>) ));
            return true;
        });
        // Break up the initialization into transactions of 1k stores, so it fits in the log
        for (long j = 0; j < arraySize; j += 1000) {
            PTM::template updateTx<bool>([larraySize,j] () {
                PERSIST<uint64_t>* parray = PTM::template get_object<PERSIST<uint64_t>*>(0);
                for (int i = 0; i < 1000 && i+j < larraySize; i++) parray[i+j] = i+j;
                return true;
            });
        }

        auto func = [this,&startFlag,&quit,&numSwapsPerTx](long long *ops, const int tid) {
            uint64_t seed = (tid*1024)+tid+1234567890123456781ULL;
            int larraySize = arraySize;
            // Spin until the startFlag is set
            while (!startFlag.load()) {}
            // Do transactions until the quit flag is set
            long long tcount = 0;
            while (!quit.load()) {
                // Everything has to be captured by value, or get/put in root pointers
                PTM::template updateTx<bool>([seed,numSwapsPerTx,larraySize] () {
                    PERSIST<uint64_t>* parray = PTM::template get_object<PERSIST<uint64_t>*>(0);
                    uint64_t lseed = seed;
                    for (int i = 0; i < numSwapsPerTx; i++) {
                        lseed = randomLong(lseed);
                        auto ia = lseed % arraySize;
                        uint64_t tmp = parray[ia];
                        lseed = randomLong(lseed);
                        auto ib = lseed % arraySize;
                        parray[ia] = parray[ib];
                        parray[ib] = tmp;
                    }
                    return true;
                });
                // Can't have capture by ref for wait-free, so replicate the seed advance outside the tx
                seed = randomLong(seed);
                seed = randomLong(seed);
                ++tcount;
                /*
                PE::read_transaction([this,&seed,&parray,&numWordsPerTransaction] () {
                    PersistentArrayInt* read_array = PE::template get_object<PersistentArrayInt*>(PIDX_INT_ARRAY);
                    // Check that the array is consistent
                    int sum = 0;
                    for (int i = 0; i < arraySize; i++) {
                        sum += read_array->counters[i];
                    }
                    assert(sum == 0);
                });
                */
            }
            *ops = tcount;
        };

        for (int irun = 0; irun < numRuns; irun++) {
            if (irun == 0) {
                className = PTM::className();
                cout << "##### " << PTM::className() << " ##### \n";
            }
            thread enqdeqThreads[numThreads];
            for (int tid = 0; tid < numThreads; tid++) enqdeqThreads[tid] = thread(func, &ops[tid][irun], tid);
            auto startBeats = steady_clock::now();
            startFlag.store(true);
            // Sleep for the duration of the test (testLengthSeconds)
            this_thread::sleep_for(testLengthSeconds);
            quit.store(true);
            auto stopBeats = steady_clock::now();
            for (int tid = 0; tid < numThreads; tid++) enqdeqThreads[tid].join();
            lengthSec[irun] = (stopBeats-startBeats).count();
            startFlag.store(false);
            quit.store(false);
        }

        PTM::template updateTx<bool>([] () {
            PTM::pfree( PTM::template get_object<PERSIST<uint64_t>*>(0) );
            PTM::template put_object<PERSIST<uint64_t>*>(0, nullptr);
            return true;
        });

        // Accounting
        vector<long long> agg(numRuns);
        for (int irun = 0; irun < numRuns; irun++) {
            agg[irun] = 0;
            for (int i = 0; i < numThreads; i++) {
                // Normalize each run to swaps/second (lengthSec[] holds nanoseconds)
                agg[irun] += ops[i][irun]*numSwapsPerTx*1000000000LL/lengthSec[irun];
            }
        }

        // Compute the median. numRuns should be an odd number
        sort(agg.begin(), agg.end());
        auto median = agg[numRuns/2];
        cout << "Swaps/sec = " << median << "\n";
        return median;
    }

    static uint64_t randomLong(uint64_t x) {
        x ^= x >> 12; // a
        x ^= x << 25; // b
        x ^= x >> 27; // c
        return x * 2685821657736338717LL;
    }
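    /*
     * The shift constants (12, 25, 27) and the odd 64-bit multiplier above
     * match the xorshift64* generator: three xor-shift steps (the 'a', 'b',
     * 'c' lines) followed by a multiplication that scrambles the low-order
     * bits. It is used here only to pick array indexes cheaply; callers
     * advance the seed by feeding the result back in, as benchmarkSPSInteger()
     * does:
     *
     *   uint64_t seed = (tid*1024)+tid+1234567890123456781ULL;
     *   seed = randomLong(seed);       // advance the generator
     *   auto idx = seed % arraySize;   // derive a random index
     */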
};

#endif


================================================
FILE: graphs/PBenchmarkSets.hpp
================================================
/*
 * Copyright 2017-2018
 *   Andreia Correia
 *   Pedro Ramalhete
 *   Pascal Felber
 *
 * This work is published under the MIT license. See LICENSE.txt
 */
#ifndef _PERSISTENT_BENCHMARK_SETS_H_
#define _PERSISTENT_BENCHMARK_SETS_H_

#include <atomic>
#include <chrono>
#include <thread>
#include <string>
#include <vector>
#include <algorithm>
#include <iostream>

using namespace std;
using namespace chrono;

template